Unificando el scraper


What a great teacher!

I'm sharing my script, though it could be improved a lot by splitting it into modules, as in David's classes.

import requests
from bs4 import BeautifulSoup
import pandas as pd


def main(url):
    links_secciones = _obtener_secciones(url)
    notas = _obtener_notas(links_secciones)
    data = _obtener_data(notas)
    _save_data(data)

def _obtener_secciones(url):
    links_secciones = []
    request = requests.get(url)

    if request.status_code == 200:
        soup = BeautifulSoup(request.text, 'html.parser')
        secciones = soup.find('ul', attrs={'class': 'hot-sections'}).find_all('li')
        links_secciones = [seccion.a.get('href') for seccion in secciones]

    return links_secciones


def _obtener_notas(links_secciones):
    notas = []
    for link in links_secciones:
        try:
            r = requests.get(link)
            if r.status_code == 200:
                soup = BeautifulSoup(r.text, 'html.parser')
                notas.extend(__obtener_urls_notas(soup))
            else:
                print('No se pudo obtener la seccion', link)
        except:
            print('No se pudo obtener la seccion', link)

    return notas


def __obtener_urls_notas(soup):
    '''
    Receives a BeautifulSoup object for a section page and returns
    a list of URLs to the articles in that section.
    '''
    lista_notas = []

    # Get the featured article
    featured_article = soup.find(
        'div', attrs={'class': 'featured-article__container'})
    if featured_article:
        lista_notas.append(featured_article.a.get('href'))

    # Get the list of articles
    article_list = soup.find('ul', attrs={'class': 'article-list'})

    for article in article_list.find_all('li'):

        if article.a:
            lista_notas.append(article.a.get('href'))

    return list(set(lista_notas))


def _obtener_data(notas):
    data = []

    for i, nota in enumerate(notas):
        print(f'Scrapeando nota {i}/{len(notas)}')
        data.append(__scrape_nota(nota))
    return data


def __scrape_nota(url):
    try:
        nota = requests.get(url)
    except Exception as e:
        print(f'Error scrapeando URL {url}')
        print(e)
        return None

    if nota.status_code != 200:
        print(f'Error obteniendo nota {url}')
        print(f'Status code = {nota.status_code}')
        return None

    s_nota = BeautifulSoup(nota.text, 'html.parser')

    ret_dict = __obtener_info(s_nota)
    ret_dict['url'] = url

    return ret_dict


def __obtener_info(s_nota):
    # Create an empty dictionary and populate it with the article info
    ret_dict = {}

    # Extract the date
    fecha = s_nota.find('span', attrs={'pubdate': 'pubdate'})
    if fecha:
        ret_dict['fecha'] = fecha.get('datetime')
    else:
        ret_dict['fecha'] = None

    # Extract the title
    titulo = s_nota.find('h1', attrs={'class': 'article-title'})
    if titulo:
        ret_dict['titulo'] = titulo.text
    else:
        ret_dict['titulo'] = None

    # Extract the volanta (kicker)
    volanta = s_nota.find('h2', attrs={'class': 'article-prefix'})
    if volanta:
        ret_dict['volanta'] = volanta.get_text()
    else:
        ret_dict['volanta'] = None

    # Extract the copete (lede)
    copete = s_nota.find('div', attrs={'class': 'article-summary'})
    if copete:
        ret_dict['copete'] = copete.get_text()
    else:
        ret_dict['copete'] = None

    # Extract the author
    autor = s_nota.find('div', attrs={'class': 'article-author'})
    if autor:
        ret_dict['autor'] = autor.get_text()
    else:
        ret_dict['autor'] = None

    # Extract the main image
    media = s_nota.find('div', attrs={'class': 'article-main-media'})
    if media:
        imagenes = media.find_all('img')
        if len(imagenes) == 0:
            print('no se encontraron imagenes')
        else:
            imagen = imagenes[-1]
            img_src = imagen.get('data-src')
            try:
                img_req = requests.get(img_src)
                if img_req.status_code == 200:
                    ret_dict['imagen'] = img_req.content
                else:
                    ret_dict['imagen'] = None
            except:
                print('No se pudo obtener la imagen')
    else:
        print('No se encontro media')

    # Extract the body
    cuerpo = s_nota.find('div', attrs={'class': 'article-text'})
    if cuerpo:
        ret_dict['cuerpo'] = cuerpo.get_text()
    else:
        ret_dict['cuerpo'] = None

    return ret_dict


def _save_data(data):
    df = pd.DataFrame(data)
    df.to_csv('Notas_pagina12.csv')
        
    return df


if __name__ == "__main__":
    url = 'https://www.pagina12.com.ar/'
    main(url)


I had quite a few problems with the code, mainly handling NoneType errors, so I first took the Introducción a Web Scraping con Xpath course and then rebuilt the scraper for the site using XPath. I did it all in one function and it was very satisfying to see it work!

import requests
import lxml.html as html
import pandas as pd

HOME_URL = 'https://www.pagina12.com.ar/'

XPATH_HOMEPAGE_LINKS_TO_ARTICLES = '/html/body//div[@class="headline-content"]//a[not(@class)]/@href'
XPATH_HOMEPAGE_LINKS_TO_SECTIONS = '/html/body//ul[@class="hot-sections"]/li/a/@href'
XPATH_SECTION_LINKS_TO_ARTICLES = '/html/body//div[@class="article-box__container"]/h2/a/@href'

XPATH_TITLE = '/html/body//h1[@class="article-title"]/text()'
XPATH_SUMMARY = '/html/body//div[@class="article-summary"]/text()'
XPATH_BODY = '/html/body//div[@class="article-text"]/p//text()'
XPATH_PREFIX = '/html/body//h2[@class="article-prefix"]/text()'
XPATH_DATE = '/html/body//span[@pubdate="pubdate"]/text()'
XPATH_AUTHOR = '/html/body//div[@class="article-main-media-header"]/div[@class="article-author"]/span/a/text()'
def scrape_site():
    try:
        response = requests.get(HOME_URL)
        if response.status_code == 200:
            final_articles = []

            home = response.content.decode('utf-8')
            parsed = html.fromstring(home)
            links_to_news = parsed.xpath(XPATH_HOMEPAGE_LINKS_TO_ARTICLES)
            links_to_sections = parsed.xpath(XPATH_HOMEPAGE_LINKS_TO_SECTIONS)

            for link in links_to_sections:

                try:
                    section_response = requests.get(link)
                    section = section_response.content.decode('utf-8')
                    section_parsed = html.fromstring(section)

                    links_to_articles_in_section = section_parsed.xpath(XPATH_SECTION_LINKS_TO_ARTICLES)
                    for url in links_to_articles_in_section:
                        links_to_news.append(url)

                except ValueError as ve:
                    print(ve)

            for link in links_to_news:
                try:
                    article_response = requests.get(link)
                    article = article_response.content.decode('utf-8')
                    article_parsed = html.fromstring(article)
                    article_elements = {}

                    title = article_parsed.xpath(XPATH_TITLE)
                    if len(title):
                        article_elements['title'] = title[0]
                    else:
                        article_elements['title'] = None

                    summary = article_parsed.xpath(XPATH_SUMMARY)
                    if len(summary):
                        article_elements['summary'] = summary[0]
                    else:
                        article_elements['summary'] = None
                    body = article_parsed.xpath(XPATH_BODY)  
                    p_elements = []
                    for text in body:
                        if str(text)[0] in [',', '.']:
                            p_elements.append(str(text))
                        else:
                            p_elements.append(' ' + str(text))


                    body = ''.join(p_elements)
                    if len(body):
                        article_elements['body'] = body
                    else:
                        article_elements['body'] = None

                    prefix = article_parsed.xpath(XPATH_PREFIX)
                    if len(prefix):
                        article_elements['prefix'] = prefix[0]
                    else:
                        article_elements['prefix'] = None

                    date = article_parsed.xpath(XPATH_DATE)
                    if len(date):
                        article_elements['date'] = date[0]
                    else:
                        article_elements['date'] = None

                    author = article_parsed.xpath(XPATH_AUTHOR)
                    if len(author):
                        article_elements['author'] = author[0]
                    else:
                        article_elements['author'] = None


                    final_articles.append(article_elements)

                except Exception as e:
                    print(e)


            return final_articles

        else:
            print(f'Error. Status code {response.status_code}')


    except ValueError as ve:
        print(ve)

Don't forget that this scraper only extracts content from the first page of each section. You can automate it to walk through all the pages; use the 'Siguiente Página' (next page) button to get the new URLs:

btn_next = True
while btn_next:
    # Extract the article information for the current page

    link_next = bshot.find('a', attrs={'class': 'pagination-btn-next'}).get('href')
    if link_next is not None:
        new_url = url + link_next
        request_page = requests.get(new_url)
        bshot = BeautifulSoup(request_page.text, 'lxml')
    else:
        btn_next = False

For example, just for the El País section there are 2153 pages to date, which is roughly 30,000 articles in that section alone.

On the topic of building an automated web scraper, I also highly recommend the Data Engineering with Python course, where the project is precisely to build a web scraper; beyond experimenting in a Jupyter notebook, you build an automated project.

At minute 02:23, where it says # extraemos el copete,

ret_dict['copete'] = volanta.get_text()
should say
ret_dict['copete'] = copete.get_text()

The only thing I find lacking is that the teacher never even mentioned that the scraper brings back information you don't need, such as the '\n' line breaks that the HTML inserts automatically. He said nothing about cleaning the data before saving it to a file, which seems really important to me.
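In case it helps, here is a minimal sketch of one way to normalize those characters before saving; the limpiar_nota helper and the field handling are only illustrative, mirroring the dictionaries built in the scripts above.

def limpiar_nota(nota: dict) -> dict:
    '''Collapse newlines, tabs and non-breaking spaces in every string field.'''
    limpia = {}
    for clave, valor in nota.items():
        if isinstance(valor, str):
            limpia[clave] = ' '.join(valor.replace('\xa0', ' ').split())
        else:
            limpia[clave] = valor
    return limpia

# Usage (assuming data is the list of dicts returned by _obtener_data):
# data = [limpiar_nota(nota) for nota in data if nota]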

For anyone taking this course, I recommend watching the Data Engineering with Python course first; the teacher explains everything very well and that way you will avoid storing unwanted characters.

Here is how my project turned out. Oct 2020

import requests
from bs4 import BeautifulSoup
from IPython.display import Image
import pandas as pd


def obtener_notas(soup):
    '''
    Receives a BeautifulSoup object for a section page
    and returns a list of URLs to the articles in that section
    '''
    lista_notas = []
    
    articles = soup.find_all('div', attrs={'class':'article-item__content'})
    for article in articles:
        if article.a: # Check that find() did not return None
            lista_notas.append(article.a.get('href'))
    return lista_notas


def get_list_news(urls):
    try:
        s12 = requests.get(urls)
        if s12.status_code == 200:
            soup  = BeautifulSoup(s12.text, 'lxml')
            lista = obtener_notas(soup)
            return lista
        else:
            print(f'La url entrego un status code de {s12.status_code}')
    except Exception as e:
        print('Error en la request')
        print(e)


def extraer_imagen_principal(s_nota):
    # Extract the article's main image
    try:
        media = s_nota.find('div', attrs = {'class':'article-main-media-image'})
        imagenes = media.find_all('img')

        if len(imagenes) == 0:
            print('no se encontraron imagenes')
            raise Exception  
        else:
            imagen = imagenes[-1] # -1 indexes from the end; the last image is the largest one
            img_src = imagen.get('data-src')
            return img_src

    except Exception as e:
        print(e)
        img_src = ''
        return img_src


def scrap_article(url_nota):
    try:
        nota = requests.get(url_nota)
        if nota.status_code == 200:
            s_nota = BeautifulSoup(nota.text, 'lxml')
            
            # The volanta (kicker)
            try:
                volanta_nota = s_nota.find('div', attrs = {'class':'article-titles'}).h2
                volanta = volanta_nota.text
            except:
                volanta = ''
                pass

            # Extract the title
            try:
                titulo_nota = s_nota.find('div', attrs = {'class':'article-titles'}).h1
                titulo = titulo_nota.text
            except:
                titulo = ''
                pass
                
            # Extract the date
            try:
                fecha_nota = s_nota.find('div', attrs = {'class':'time'}).span.get('datetime')
            except:
                fecha_nota = ''
                pass

            # Main image
            img = extraer_imagen_principal(s_nota)

            # Copete (lede)
            try:
                copete_nota = s_nota.find('div', attrs = {'class':'article-summary'})
                copete = copete_nota.text
            except:
                copete = ''
                pass

            # The body
            try:
                cuerpo_nota = s_nota.find('div', attrs = {'class':'article-text'}).find_all('p')
                cuerpo = ''
                for paragraph in cuerpo_nota:
                        cuerpo = cuerpo + ' ' + paragraph.text
            except:
                cuerpo = ''
                pass
            
            return volanta, titulo, fecha_nota, img, copete, cuerpo

        else:
            print(f'La url entrego un status code de {nota.status_code}')
            raise Exception
    except Exception as e:
        print('Error:')
        print(e)
        print('\n')


def obtener_urls_secciones(url):
    p12 = requests.get(url)
    s = BeautifulSoup(p12.text, 'lxml')
    if p12.status_code == 200:
        secciones = s.find('ul', attrs = {'class':'horizontal-list main-sections hide-on-dropdown'}).find_all('li')
        links_secciones = [seccion.a.get('href') for seccion in secciones]
        return links_secciones
    else:
        pass


def main(url):

    links_secciones = obtener_urls_secciones(url)

    fechas = []
    volantas = []
    titulos = []
    imgs = []
    copetes = []
    cuerpos = []
    urls_news = [] 

    for urls in links_secciones:
        lista = get_list_news(urls)
        if not lista:
            continue

        for url in lista:
            resultado = scrap_article(url)
            if resultado is None:
                continue
            volanta_nota, titulo_nota, fecha_nota, img, copete_nota, cuerpo = resultado

            urls_news.append(url)
            volantas.append(volanta_nota)
            titulos.append(titulo_nota)
            fechas.append(fecha_nota)
            imgs.append(img)
            copetes.append(copete_nota)
            cuerpos.append(cuerpo)

    web_scrap = {'urls':urls_news, 'fechas':fechas, 'titulos':titulos, 'volantas':volantas, 'imgs': imgs, 'copetes':copetes, 'cuerpos':cuerpos}
        
    df = pd.DataFrame(web_scrap)
        
    return df


if __name__ == '__main__':

    url = 'https://www.pagina12.com.ar/'
    df = main(url)

    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_colwidth', 30)
    print('\n')
    print(df)

    df.to_csv('Notas-Pagina12.csv')
    df.to_excel('Notas-V-Pagina12.xlsx', sheet_name='Hoja_1')

Everything worked! In my case I tried several CSV export alternatives and none of them worked. In the end I exported to Excel and it worked right away. If anyone runs into the same situation, it may be worth trying.

df2.to_excel('Notas-V Pagina12.xlsx', sheet_name='Hoja_1')
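If the CSV route fails, it is often an encoding or missing-engine issue rather than the data itself. A small sketch of both export calls, using a tiny illustrative DataFrame (in the real script the DataFrame comes from main(); the .xlsx output assumes openpyxl is installed):

import pandas as pd

# Tiny illustrative DataFrame; in the real script it is built by main().
df = pd.DataFrame([{'titulo': 'Ejemplo', 'cuerpo': 'Texto con acentos: á, ñ'}])

# utf-8-sig keeps accented characters readable when the CSV is opened in Excel.
df.to_csv('Notas-Pagina12.csv', index=False, encoding='utf-8-sig')

# Excel export relies on the openpyxl engine (pip install openpyxl).
df.to_excel('Notas-V-Pagina12.xlsx', sheet_name='Hoja_1', index=False)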

In the obtener_info() function, the s_nota.find for the title needs to be corrected. Instead of 'div', use 'h1':

titulo = s_nota.find('h1', attrs={'class':'article-title'})
    if titulo:
        ret_dict['titulo'] = titulo.text
    else:
        ret_dict['titulo'] = None

With 'div' you get the titles of other articles shown in the sidebar, but not the title of the article itself.

I got lost on when the obtener_info function was written, and it throws an error for me. At what minute does it appear?

Now that we know how to do it from scratch, take a look at this tool that can help us a lot:

https://www.youtube.com/watch?v=n7fob_XVsbY

Very interesting. I liked the methodology explained step by step, and how to find the div for each of the elements to scrape.

This program downloads 260 articles (it takes a while). I built it so it stores the section by creating an articulo class and then working with it; I also saved each article's section.

It does download the image, but since I work from the console I didn't insert it into the file. I ran it in a Jupyter notebook, did the same validation the teacher does in the course, and it does show the image.

Here is the code.

import requests
from bs4 import BeautifulSoup
import lxml.html as html
from IPython.display import Image,display
import pandas as pd

class articulo:
    def __init__(self, seccion, url, fecha, titulo, reseña, resumen, imagen_src):
        self.seccion = seccion
        self.url = url
        self.fecha = fecha
        self.titulo = titulo
        self.reseña = reseña
        self.resumen = resumen
        self.imagen_src = imagen_src

    def print_articulo(self):
        print(self.seccion)
        print(self.url)
        print(self.fecha)
        print(self.reseña)
        print(self.titulo)
        print(self.resumen)

    def convertir_a_diccionario(self):
        return {'seccion': self.seccion, 'url': self.url, 'fecha': self.fecha, 'titulo': self.titulo, 'reseña': self.reseña, 'resumen': self.resumen}
        
        

def list_new_secciones(links):
    news=BeautifulSoup(links.text,'html.parser')
    bajadas=news.find_all('div', attrs={'class':'article-item__content'})
    links_h2_noticias=[f.h2.a.get('href') for f in bajadas if f.h2]
    links_h3_noticias=[f.h3.a.get('href') for f in bajadas if f.h3]
    links_h4_noticias=[f.h4.a.get('href') for f in bajadas if f.h4]
    return links_h2_noticias +links_h3_noticias+links_h4_noticias

def traer_fecha(articulo_url):
    try:
        s_articulo=requests.get(articulo_url)
        if s_articulo.status_code== 200:
            s_articulo2=BeautifulSoup(s_articulo.text,'html.parser')
            fecha=s_articulo2.find('span', attrs={'pubdate':'pubdate'})
            date=fecha.get('datetime')
        else:
            date=None
    except  ValueError as ve:
            print("Hubo un error en la request")
            print(ve)
            print("\n")    
    return date

def traer_titulo(articulo_url):
    try:
        s_articulo=requests.get(articulo_url)
        if s_articulo.status_code== 200:
            s_articulo2=BeautifulSoup(s_articulo.text,'html.parser')
            titulo=s_articulo2.find('h1', attrs={'class':'article-title'})
            title=titulo.get_text()
        else:
            title=None
    except  ValueError as ve:
            print("Hubo un error en la request")
            print(ve)
            print("\n")    
    return title

def traer_reseña(articulo_url):
    try:
        s_articulo=requests.get(articulo_url)
        if s_articulo.status_code== 200:
            s_articulo2=BeautifulSoup(s_articulo.text,'html.parser')
            reseña=s_articulo2.find('h2', attrs={'class':'article-prefix'})
            if reseña:
                 res=reseña.get_text()
            else:
                res=None
    except  ValueError as ve:
            print("Hubo un error en la request")
            print(ve)
            print("\n")    
    return res

def traer_imagen(articulo_url):
    try:
        s_articulo=requests.get(articulo_url)
        if s_articulo.status_code== 200:
            s_articulo2=BeautifulSoup(s_articulo.text,'html.parser')
            media=s_articulo2.find('div', attrs={'class':'article-main-media-image'})
            if media:
                pics=media.find_all(('img'))
                if(len(pics)==0):
                     pic_source=None
                else:
                    pic=pics[-1]
                    pic_source=pic.get('data-src')
                    pic_req=requests.get(pic_source)
                    if pic_req.status_code==200:
                        article_pic=pic_req.content
            else:
                article_pic=None
    except  ValueError as ve:
            print("Hubo un error en la request")
            print(ve)
            print("\n")    
    return article_pic

def traer_resumen(articulo_url):
    try:
        s_articulo=requests.get(articulo_url)
        if s_articulo.status_code== 200:
            s_articulo2=BeautifulSoup(s_articulo.text,'html.parser')
            resume=s_articulo2.find('div', attrs={'class':'article-summary'})
            if resume:
                 rsume=resume.get_text()
            else:
                rsume=None
    except  ValueError as ve:
            print("Hubo un error en la request")
            print(ve)
            print("\n")    
    return rsume


def run():
    url= 'https://www.pagina12.com.ar/'
    p12= requests.get(url)
    if p12.status_code== 200:
        s=BeautifulSoup(p12.text,'html.parser')
        secciones=s.find('ul', attrs={'class':'horizontal-list main-sections hide-on-dropdown'}).find_all('li')
        links_secciones=[seccion.a.get('href') for seccion in secciones]
        lista_de_articulos=[]
        sec=[]
        s=[]
        i=0
        n=len(links_secciones)      
        while i<n:
            a=secciones[i].a.get_text()
            sec=links_secciones[i]
            try:
                s=s+list_new_secciones(requests.get(sec))
                for art in s:
                    lista_de_articulos.append(articulo(a,art,'','','','',''))                
            except  ValueError as ve:
                print("Hubo un error en la request")
                print(ve)
                print("\n")        
            i=i+1
        for obj in lista_de_articulos:
            a_url=obj.url
            obj.fecha= traer_fecha(a_url)
            obj.titulo=traer_titulo(a_url)
            obj.reseña=traer_reseña(a_url)
            obj.resumen=traer_resumen(a_url)
            obj.imagen_src=traer_imagen(a_url)
        
        #img=(Image(lista_de_articulos[0].imagen_src))
        arts={}
        L_arts=[]
        for a in lista_de_articulos:
            arts=a.convertir_a_diccionario()
            L_arts.append(arts)
        df=pd.DataFrame(L_arts)
        df.head()
        df.to_csv('articulos Pagina 12.csv',encoding='utf-8-sig')

if __name__=='__main__':
    run()

I'm also leaving the GitHub repository here at this link.

As a tip, while working I recommend not using all of the site's sections; just keep testing with the first one by tweaking this part of the code:

while i<n:
            a=secciones[i].a.get_text()
            sec=links_secciones[i]

Change "i<n" to "i<1" so the run doesn't take as long.
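An alternative, if you prefer not to touch the loop condition, is to slice the list of section links before iterating; a quick sketch, assuming links_secciones is the list built in run() (the URL below is only a placeholder):

# Illustrative: keep only the first section link while developing.
links_secciones = ['https://www.pagina12.com.ar/secciones/el-pais']  # placeholder value
links_de_prueba = links_secciones[:1]   # raise the 1 to scrape more sections
for sec in links_de_prueba:
    print('Scraping test section:', sec)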

I almost didn't make it this far... I was about to drop the course. The teacher explains very well, but there were many things I didn't understand and I always had an error with something or it just didn't work out. Almost two weeks, but I managed to understand it and finish the challenges.

I loved the course!!!

Awesome!!! Very well explained!!


AttributeError                            Traceback (most recent call last)
<ipython-input-160-ef5ea79c4215> in <module>
      2 for i, nota in enumerate(notas):
      3     print(f'Scrapeando nota{i}/{len(notas)}')
----> 4     data.append(scrape_nota(nota))

<ipython-input-150-f7235a0b2bf8> in scrape_nota(url)
     14 s_nota = BeautifulSoup(nota.text, 'lxml')
     15
---> 16 ret_dict = obtener_info(s_nota)
     17 ret_dict['url'] = url
     18

<ipython-input-149-326501bcc8c6> in obtener_info(s_nota)
     28 copete = s_nota.find('div', attrs={'class':'article-summary'})
     29 if copete:
---> 30     ret_dict['copete'] = volanta.get_text()
     31 else:
     32     ret_dict['copete'] = None

AttributeError: 'NoneType' object has no attribute 'get_text'
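That traceback matches the typo pointed out above: inside obtener_info, the copete block calls get_text() on volanta, which is None whenever the article has no volanta. A sketch of the corrected lines, using the same selectors as the class code (this fragment belongs inside obtener_info, where s_nota and ret_dict already exist):

# Corrected copete block: call get_text() on copete, not volanta.
copete = s_nota.find('div', attrs={'class': 'article-summary'})
if copete:
    ret_dict['copete'] = copete.get_text()
else:
    ret_dict['copete'] = None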

Really nice project.

In case it happens to anyone else working in Jupyter:
when getting the date in the obtener_info function, you don't need fecha.get('datetime'), because it doesn't pick it up. Just writing:
dict['fecha'] = fecha
already works.
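Whether .get('datetime') works depends on which <span> the selector matched, and assigning the raw tag stores a BeautifulSoup object rather than a string. A small sketch that tries the attribute first and falls back to the visible text, assuming the same pubdate selector and the s_nota / ret_dict names used in the class code:

# Prefer the datetime attribute, fall back to the span's visible text.
fecha_tag = s_nota.find('span', attrs={'pubdate': 'pubdate'})
if fecha_tag:
    ret_dict['fecha'] = fecha_tag.get('datetime') or fecha_tag.get_text(strip=True)
else:
    ret_dict['fecha'] = None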

That was exactly my question about the image from two classes ago...

Great first section!

Excellent course once again. I have gone through it again and again and I find it superb. Thanks!

Things have changed for the title and the volanta. So the GitHub code doesn't always return the same title and volanta, replace the corresponding lines in the scrap_nota(url) function with the following:

    # Extract the title (assumes numpy is imported as np)
    try:
        ret_dict['titulo'] = s_nota.find('div',attrs={'class':'article-titles'}).find('h1').get_text()
    except:
        ret_dict['titulo'] = np.nan

    # Extract the volanta
    try:
        ret_dict['volanta'] = s_nota.find('div',attrs={'class':'article-titles'}).find('h2').get_text()
    except:
        ret_dict['volanta'] = np.nan

Sharing my solution:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json
import base64

url = 'https://www.pagina12.com.ar/'


def _get_section_links(url):
    print(f'Getting section links from {url}')

    section_links = []
    try:
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            main_sections  = soup.find('ul', attrs={'class': 'main-sections'}).find_all('li')
            main_sections_links = [seccion.a.get('href') for seccion in main_sections]
            main_supplements  = soup.find('ul', attrs={'class': 'main-supplements'}).find_all('li')[1:]    
            main_supplements_links = [seccion.a.get('href') for seccion in main_supplements]
            section_links = main_sections_links + main_supplements_links
        else:
            print(f'Error while getting {url}')
            print(f'Status code: {response.status_code}')

    except Exception as e:
        print(f'Error while scrapping {url}')
        print(e)
    
    return section_links


def _get_article_links(section_link):
    print(f'Getting article links from {section_link}')

    article_links = []
    try:
        # Get section
        section_response = requests.get(section_link)
        if section_response.status_code == 200:
            soup = BeautifulSoup(section_response.text, 'html.parser')
            # Featured article
            featured_article = soup.find('div', attrs={'class':'article-item__content'})
            if featured_article:
                article_links.append(featured_article.a.get('href'))
            # Article list    
            article_group = soup.find_all('div', attrs={'class': 'articles-list'})
            for group in article_group:
                article_list = group.find_all('article', attrs={'class': 'article-item'})
                for article in article_list:
                    if article.a:
                        article_links.append(article.a.get('href'))
            # Join with base url
            if article_links:
                article_links = [urljoin(url, article_link) for article_link in article_links]

        else:
            print(f'Error while getting {section_link}')
            print(f'Status code: {section_response.status_code}')

    except Exception as e:
        print(f'Error while scrapping {section_link}')
        print(e)

    return article_links


def _get_article_data(article_link):
    print(f'Getting article data from {article_link}')

    article_data = {}
    try:
        article = requests.get(article_link)
        if article.status_code == 200:
            soup = BeautifulSoup(article.text, 'html.parser')
            # prefix        
            prefix = soup.find('div', attrs={'class': 'col 2-col'}).h4.text
            # title
            title = soup.find('div', attrs={'class': 'col 2-col'}).h1.text
            # summary
            summary = soup.find('div', attrs={'class': 'col 2-col'}).h3.text
            # date_time
            date_time = soup.find('time').get('datetime')
            # author
            author = soup.find('div', attrs={'class': 'author-name'})
            if author:
                author = author.text
            # body
            body = ''
            paragraphs = soup.find('div', attrs={'class': 'article-main-content'}).find_all('p')
            for paragraph in paragraphs:
                if paragraph.text:
                    body += paragraph.text
            # image (initialize img so the dict below never references an undefined name)
            img = None
            images = soup.find('div', attrs={'class': 'article-main-media-image__container'}).find_all('img')
            if len(images) == 0:
                print('No images found')
            else:
                image = images[-1]
                img_src = image.get('src')
                try:
                    img_response = requests.get(img_src)
                    if img_response.status_code == 200:
                        img = base64.encodebytes(img_response.content).decode('utf-8')
                except Exception as e:
                    print('Could not obtain the image content')
                    print(e)

            article_data = {
                'prefix': prefix,
                'title': title,
                'summary': summary,
                'author': author,
                'datetime': date_time,
                'img': img,
                'body': body,
                'article_url': article_link
            }

        else:
            print(f'Error scrapping {article_link}')
            print(f'Status code: {article.status_code}')

    except Exception as e:
        print(f'Error scrapping {article_link}')
        print(e)
        print('\n')

    return article_data


def _export_data(data, filename):
    print(f'Exporting data to {filename}')
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(json.dumps(data, indent=4))


def main():
    print(f'Scrapping {url}')

    articles = []
    section_links = _get_section_links(url)
    for section_link in section_links:
        article_links = _get_article_links(section_link)
        for article_link in article_links:
            article_data = _get_article_data(article_link)
            article_data['section_url'] = section_link
            articles.append(article_data)

    if articles:
        json_data = {
            'url': url,
            'articles': articles
        }
        _export_data(json_data, 'data.json')


if __name__ == '__main__':
    main()

My code as of 26/04/2022

import requests
import lxml
import re
import pandas as pd
from bs4 import BeautifulSoup


def build_link(url, link):
    """Build the links if it's not complete."""
    is_well_formed_link = re.compile(r'^https?://.+/.+$')
    is_root_path = re.compile(r'^/.+$')
    
    if is_well_formed_link.match(link):
        return link
    elif is_root_path.match(link):
        return f'{url}{link}'
    else:
        return f'{url}/{link}'


def clean_text(text):
    return text.replace("\xa0", "").replace("\n", "")


def get_response(url):
    try:
        return requests.get(url)
    except Exception as e:
        print("Error at getting the response: ", e)


def get_soup(response, parser='lxml'):
    # Return a Soup object
    return BeautifulSoup(response.text, parser)


def get_sections(url):
    """Returns a list of sections. Ex: (El pa铆s, Econom铆a, Sociedad, etc...)"""
    try:
        sections_response = get_response(url)
        
        if sections_response.status_code == 200:
            sections_soup = get_soup(sections_response)
            # Every section is inside an li that is inside a ul. find() only returns the first tag
            sections = sections_soup.find('ul', attrs={'class':'horizontal-list main-sections hide-on-dropdown'}).find_all('li')         
            if sections:               
                return [section.a.get('href') for section in sections]
            else:
                return "No sections."
        else:
            return "Error sections respone != 200"
    except Exception as e:
        print("Error getting the sections: ", e)


def get_articles_links(soup):

    articles_links = []
    
    # Get the featured article of the section 
    # by searching the FIRST div with class = article-item__content
    featured_article = soup.find('div', attrs={'class':'article-item__content'})
    
    if featured_article:
        featured_article_link = build_link('https://www.pagina12.com.ar', featured_article.a.get('href'))
        articles_links.append(featured_article_link)
    
    # The remaining articles are in different groups of divs
    article_groups = soup.find_all('div', attrs={'class':'articles-list'})
    
    for group in article_groups:
        # All the articles of the group
        articles = group.find_all('article', attrs={'class':'article-item'})
        # Loop for every article
        for article in articles:
            # The div that has the article inside
            div_of_article = article.find('div', attrs={'class':'article-item__content-footer-wrapper'})
            article_link = build_link('https://www.pagina12.com.ar', div_of_article.a.get('href'))
            articles_links.append(article_link)
    
    # Special case for the 'cultura y espectaculos' section
    if not articles_links:
        # All the articles
        articles = soup.find_all('div', attrs={'class':'article-box__container'})
        # Loop for every article
        for article in articles:
            article_link = article.h2
            article_link = build_link('https://www.pagina12.com.ar', article_link.a.get('href'))
            articles_links.append(article_link)        
    
    # Returns a list
    return articles_links


def get_all_articles_urls(sections_urls):    
    all_articles_urls = []
    
    for i, section_url in enumerate(sections_urls):
        print(f"\n- Scrapping section {i+1}/{len(sections_urls)}")        
        section_response = get_response(section_url)
        
        if section_response.status_code == 200:
            print("  Section: ", section_url)
            print("  Response: ", section_response.status_code)           
            section_soup = get_soup(section_response)
            articles_links = get_articles_links(section_soup)
            all_articles_urls = all_articles_urls + articles_links          
        else:
            print("Error status code of section: ", section_response.status_code)

    return all_articles_urls   


def scrape_article(article_link):          
    article_response = get_response(article_link)

    if article_response.status_code == 200:
        article_soup = get_soup(article_response)       
        title = get_article_title(article_soup)
        #print("Title: ", title)  
        date = get_article_date(article_soup)
        #print("Date: ", date)
        author = get_article_author(article_soup)
        #print("Suthor: ", author)
        volanta = get_article_volanta(article_soup)
        #print("Volanta: ", volanta)
        copete = get_article_copete(article_soup)
        #print("Copete: ", copete[0:10])
        body = get_article_body(article_soup)
        #print("Body: ", body[0:10])        
        image = get_article_image(article_soup)
        #print("Image: ", image)   
        return title, date, author, volanta, copete, body, image
    else:
        print("Error status code at scrapping the article: ", article_response.status_code)


def scrape_articles(articles_links):
    articles_titles = []
    articles_dates = []
    articles_authors = []
    articles_volantas = []
    articles_copetes = [] 
    articles_bodys = []    
    articles_images = []
    
    for i, article_link in enumerate(articles_links):
        print(f"Scrapping article {i+1}/{len(articles_links)}")
        
        article_response = get_response(article_link)

        if article_response.status_code == 200:
            article_soup = get_soup(article_response)       
            title = get_article_title(article_soup)
            #print("Title: ", title)  
            date = get_article_date(article_soup)
            #print("Date: ", date)
            author = get_article_author(article_soup)
            #print("Suthor: ", author)
            volanta = get_article_volanta(article_soup)
            #print("Volanta: ", volanta)
            copete = get_article_copete(article_soup)
            #print("Copete: ", copete[0:10])
            body = get_article_body(article_soup)
            #print("Body: ", body[0:10])        
            image = get_article_image(article_soup)
            #print("Image: ", image)   

            articles_titles.append(title)
            articles_dates.append(date)
            articles_authors.append(author)
            articles_volantas.append(volanta)
            articles_copetes.append(copete)
            articles_bodys.append(body) 
            articles_images.append(image)
            
        else:
            print("Error status code at scrapping the article: ", article_response.status_code)
    
    return articles_titles, articles_dates, articles_authors, articles_volantas, articles_copetes, articles_bodys, articles_images


def get_article_title(soup):
    try:
        title = soup.find('div', attrs={'class':'content'})
        if title:
            title = title.h1.text
            title = clean_text(title)
            return title
        else:
            return "No title."
    except Exception as e:
        print("Error getting the article title: ", e)


def get_article_date(soup):
    try:
        date = soup.find('div', attrs={'class':'date modification-date'}).span.time.text
        if date:
            return date
        else:
            return "No date."
    except Exception as e:
        print("Error getting the article date: ", e)    


def get_article_author(soup):
    try:
        author = soup.find('div', attrs={'class':'author-name'})
        if author:
            author = clean_text(author.text)
            return author
        else:
            return "No author."
    except Exception as e:
        print("Error getting the article author: ", e)  


def get_article_volanta(soup):
    try:
        volanta = soup.find('div', attrs={'class':'content'}).h4.text
        volanta = clean_text(volanta)
        if volanta:
            return volanta
        else:
            return "No volanta."
    except Exception as e:
        print("Error getting the article volanta: ", e)        


def get_article_copete(soup):
    try:
        copete = soup.find('div', attrs={'class':'content'}).h3.text
        copete = clean_text(copete)
        if copete:
            return copete
        else:
            return "No copete."
    except Exception as e:
        print("Error getting the article copete: ", e)        


def get_article_body(soup):
    try:
        body = soup.find('div', attrs={'class':'article-main-content article-text'})
        body_text = ""
        if body:
            body = body.find_all('p')
            for p in body:
                body_text = body_text + p.text
            body_text = clean_text(body_text)
            if body_text != "":
                return body_text
            else:
                return "No body."
        else:
            return "No body."
    except Exception as e:
        print("Error getting the article body: ", e)  


def get_article_image(soup):
    try:
        image = soup.find('div', attrs={'class','image-wrapper'})
        if image:
            image = image.img.get('src')
            return image
        else:
            return "No image."
    except Exception as e:
        print("Error getting the article image: ", e)  


def save_data(data):
    df = pd.DataFrame(data)
    df.to_csv('Notas_pagina12.csv')
    print("Dataframe was saved to csv.")
    return df
def run():
    
    URL = "https://www.pagina12.com.ar"

    sections = get_sections(URL)
    all_articles_urls = get_all_articles_urls(sections)    
    
    # To store the data of all articles
    articles_titles, articles_dates, articles_authors, articles_volantas, articles_copetes, articles_bodys, articles_images = scrape_articles(all_articles_urls)

    # dictionary of lists 
    data = {'url':all_articles_urls , 'title': articles_titles,
            'date': articles_dates, 'author': articles_authors,
            'volanta': articles_volantas, 'copete': articles_copetes, 
            'body': articles_bodys, 'image': articles_images}
    
    save_data(data)
if __name__ == '__main__':
    run()

I built a scraper for the top 250 movies; it's a very common project. Here is the link, I drew a lot on what we saw in these classes:

https://github.com/Jeanfabra/web-scraper-top-250-movies

7-12-21

def obtener_info(s_nota):

    # Create an empty dictionary and populate it with the information
    ret_dict = {}

    # Extract the date
    fecha = s_nota.find('span', attrs={'class': 'p12-separator--right--gray'})
    if fecha:
        ret_dict['fecha'] = fecha.text
    else:
        ret_dict['fecha']= None
    
    # Extract the title
    titulo = s_nota.find('div', attrs={'class': 'col 2-col'})
    if titulo:
        ret_dict['titulo'] = titulo.text
    else:
        ret_dict['titulo']= None

    # Extract the volanta (kicker)
    volanta = s_nota.find('div', attrs={'class': 'col 2-col'})
    if volanta:
        ret_dict['volanta'] = volanta.h4.text
    else:
        ret_dict['volanta']= None
    
    # Extract the copete (lede)
    copete = s_nota.find('div', attrs={'class': 'col 2-col'})
    if copete:
        ret_dict['copete'] = copete.h3.text
    else:
        ret_dict['copete']= None

    # Extract the author
    autor = s_nota.find('div', attrs={'class': 'author-name'})
    if autor:
        ret_dict['autor'] = autor.text
    else:
        ret_dict['autor'] =  None
    
    # Extract the main image (Image below comes from IPython.display)
    media = s_nota.find('div', attrs={'class': 'article-main-media-image__container'})
    if  media:
        imagenes= media.find_all('img')
        if len(imagenes) == 0:
            print('No se encontraron imagenes')
        else:
            imagen = imagenes[-1]
            img_src = imagen.get('src')
            try:
                img_req= requests.get(img_src)
                if img_req.status_code ==200:
                    ret_dict['imagen']=Image(img_req.content) 
                else:
                    ret_dict['imagen'] = None

            except:
                print('No se encontro imagen')
    else:
        print('No se encontro media')
    
    # Extract the article body
    cuerpo = s_nota.find('div', attrs={'class': 'article-main-content article-text'})
    if cuerpo:
        ret_dict['cuerpo'] = cuerpo.text
    else:
        ret_dict['cuerpo'] = None

    return ret_dict

When scraping the links, they come back incomplete:

And that causes an error here:
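When the href values come back as relative paths, one way to complete them is to join each link with the site root before requesting it. A quick sketch using urllib.parse.urljoin, with a made-up relative link for illustration:

from urllib.parse import urljoin

BASE_URL = 'https://www.pagina12.com.ar/'

relative_link = '/notas/ejemplo-de-nota'      # illustrative href as returned by article.a.get('href')
full_link = urljoin(BASE_URL, relative_link)  # -> https://www.pagina12.com.ar/notas/ejemplo-de-nota
print(full_link)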

Awesome!

Spectacular, this module is really good.

My contribution, a bit rough, as of 22 June 2021.

After a few tweaks to get the author and assign None to articles without an author:

# Function to scrape a single news article
def News_scraper(link):
    from IPython.display import Image
    try:
        nota = requests.get(link)
    except Exception as e:
        print('Error: ', e)
    if nota.status_code == 200:
        
        s_nota = BeautifulSoup(nota.text, 'lxml')

        ret_dict = {} # Dictionary to store the article data

        # Extract the title
        try:
            titulo = s_nota.find('div', attrs={'class': 'col 2-col'}).h1.get_text()
        except Exception as e:
            print(f'Error en el título de {link}')
            titulo = None

        if titulo:
            ret_dict['titulo'] = titulo
        else:
            ret_dict['titulo'] = None

        # Extract the article author
        try:
            autor = s_nota.find('div', attrs={'class':'author-name'}).get_text()
            autor_x = autor.replace('Por ', '')
            if autor:
                ret_dict['autor'] = autor_x 
            else:
                ret_dict['autor'] = None
        except Exception as e:
            print(f'Error en el autor de {link}')
            ret_dict['autor'] = None

        # Extract the date
        try:
            fecha = s_nota.find('span', attrs={'pubdate':'pubdate'}).get('datetime')
        except Exception as e:
            print(f'Error en la fecha de {link}')
            fecha = None

        if fecha:
            ret_dict['fecha'] = fecha
        else:
            ret_dict['fecha'] = None

        # Extract the copete (lede)
        try:
            copete = s_nota.find('div', attrs={'class': 'col 2-col'}).h3.get_text()
        except Exception as e:
            print(f'Error en el copete de {link}')
            copete = None
            
        if copete:
            ret_dict['copete'] = copete
        else:
            ret_dict['copete'] = None

        # Extract the volanta (kicker)
        try:
            volanta = s_nota.find('div', attrs={'class': 'col 2-col'}).h4.get_text()
        except Exception as e:
            print(f'Error en la volanta de {link}')
            volanta = None
        
        if volanta:
            ret_dict['volanta'] = volanta
        else:
            ret_dict['volanta'] = None

        # Extract the article body
        try:
            cuerpo = s_nota.find('div', attrs={'class': 'article-main-content article-text'}).find_all('p')
            cuerpo_text = [parrafo.getText() for parrafo in cuerpo]
        except Exception as e:
            print(f'Error en el cuerpo de {link}')
            cuerpo_text = None
            
        if cuerpo_text:
            ret_dict['cuerpo'] = cuerpo_text
        else:
            ret_dict['cuerpo'] = None
        
        try:
            imagenes = s_nota.find('div', attrs={'class': 'image-wrapper'}).find_all('img')
        except Exception as e:
            print(f'Error en la imagen de {link}')
            imagenes = []

        if len(imagenes) == 0:
            print('No se encontraron imágenes')
        else:
            imagen = imagenes[-1]
            img_src = imagen.get('data-src')

        try:
            img_req = requests.get(img_src)
            if img_req.status_code == 200:
                ret_dict['imagen'] = img_req.content
            else:
                print('URL de la imagen no encontrada, error: ', img_req.status_code)
                ret_dict['imagen'] = None
        except Exception as e3:
            print('Error: ', e3)

        ret_dict['url'] = link

        return ret_dict

Hi. Sharing my code for this section.
Keep in mind:

  • I used CSS selectors and parsed the HTML
  • As a good practice, each function should be split out into its own module

import requests
import bs4 
from requests.exceptions import HTTPError
import re
from IPython.display import display, Image
import datetime
import csv
import pandas as pd

def get_section(url):

    try:
        response = requests.get(url)
    except HTTPError:
        print('HTTP Error, web not available')

    response.encoding = 'utf-8'
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    sections = list(set(soup.select('ul.horizontal-list a')))
    
    category_links = []

    for i in range(len(sections)):
        category = sections[i]['href']
        category_links.append(category)
    
    link_generator(category_links)

def link_generator(sections):
    article_links = []

    for section in sections:
        try:
            response = requests.get(section)
        except HTTPError:
            print('HTTP Error, web not available')

        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        try: 
            if re.search('cultura', section):
                news = set(soup.select('.article-box__container a'))
                for item in news:
                    article_links.append(item['href'])
            else:
                news = set(soup.select('.title-list a'))
                for item in news:
                    article_links.append(item['href'])

        except Exception:
            print(f'Article not available: {item.text}')

    article(article_links)

def article(article_links):
    news = []

    for article in article_links:
        news.append(content(article))

    save_data(news)

def save_data(news):
    df = pd.DataFrame(news)
    now = datetime.datetime.now().strftime('%Y_%m_%d')
    df.to_csv(f'Página12 Articles_{now}.csv')

def content(link):
    try:
        response = requests.get(link)
        response.encoding = 'utf-8'
        soup = bs4.BeautifulSoup(response.text, 'html.parser')

        article_info = {}

        title = soup.select('.article-titles h1')
        if title:
            article_info['Title'] = title[0].text
        else:
            article_info['Title'] = "N/A"

        volanta = soup.select('.article-prefix')
        if volanta:
            article_info['Volanta'] = volanta[0].text
        else:
            article_info['Volanta'] = "N/A"

        summary = soup.select('.article-summary')
        if summary:
            article_info['Summary'] = summary[0].text
        else:
            article_info['Summary'] = "N/A"

        body = soup.select('div.article-text')
        if body:
            article_info['Body'] = body[0].text
        else:
            article_info['Body'] = "N/A"

        media = soup.find('div', attrs={'class':'article-main-media-image'})
        if media:
            imagenes = media.find_all('img')
            if len(imagenes) == 0:
                print('Images not found')
            else:
                imagen = imagenes[-1]
                img_src = imagen.get('data-src')
                try:
                    img_req = requests.get(img_src)
                    if img_req.status_code == 200:
                        article_info['Image'] = Image(img_req.content)
                    else:
                        article_info['Image'] = "N/A"
                except:
                    article_info['Image'] = "N/A"

        url = link
        if url:
            article_info['Url'] = link
        else:
            article_info['Url'] = None
    
    except Exception:
            print(f'Link not available: {link}')

    return article_info

if __name__ == '__main__':
    url = 'https://www.pagina12.com.ar'
    print('** Welcome to Pagina12 Web Scraper **')
    get_section(url)

Is the dictionary supposed to come out in alphabetical order?


Hi, does anyone know why I might be getting this error when I try to create the df = pd.DataFrame(data)? Thanks a lot!

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-263-039b238b38ef> in <module>
----> 1 df = pd.DataFrame(data)

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in __init__(self, data, index, columns, dtype, copy)
    472                     if is_named_tuple(data[0]) and columns is None:
    473                         columns = data[0]._fields
--> 474                     arrays, columns = to_arrays(data, columns, dtype=dtype)
    475                     columns = ensure_index(columns)
    476 

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in to_arrays(data, columns, coerce_float, dtype)
    462     elif isinstance(data[0], abc.Mapping):
    463         return _list_of_dict_to_arrays(
--> 464             data, columns, coerce_float=coerce_float, dtype=dtype
    465         )
    466     elif isinstance(data[0], ABCSeries):

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in _list_of_dict_to_arrays(data, columns, coerce_float, dtype)
    560         gen = (list(x.keys()) for x in data)
    561         sort = not any(isinstance(d, dict) for d in data)
--> 562         columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)
    563 
    564     # assure that they are of the base dict class and not of derived

pandas\_libs\lib.pyx in pandas._libs.lib.fast_unique_multiple_list_gen()

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\internals\construction.py in <genexpr>(.0)
    558 
    559     if columns is None:
--> 560         gen = (list(x.keys()) for x in data)
    561         sort = not any(isinstance(d, dict) for d in data)
    562         columns = lib.fast_unique_multiple_list_gen(gen, sort=sort)

AttributeError: 'NoneType' object has no attribute 'keys'
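The traceback suggests that data contains None entries: __scrape_nota returns None whenever the request throws an exception or the status code isn't 200, and pandas then calls .keys() on those entries when building the columns. A minimal fix, assuming you simply want to drop the notes that failed, is to filter them out before creating the DataFrame:

# __scrape_nota returns None for notes that could not be fetched; drop them first
data_ok = [d for d in data if d is not None]
print(f'{len(data) - len(data_ok)} notas descartadas por errores de descarga')
df = pd.DataFrame(data_ok)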

There is no GitHub link anymore. It looks like the repository was deleted.

Any idea what the error is?

Excellent class!

What a great course! Nothing like the previous one by David Aroesti…

Great module, I learned a lot by experimenting on different pages.

My solution

#! /usr/bin/env python3
import requests
from bs4 import BeautifulSoup
import pandas as pd
from typing import Dict, List, Optional


URL_PAGE_12 = 'https://www.pagina12.com.ar/'


def __validate_request(link):
    response: Optional[requests.Response] = None
    try:
        response = requests.get(link)
    except requests.exceptions.RetryError as re:
        print(f'There is a problem with the connection to the page\n{re}')
    except requests.exceptions.ConnectionError as ce:
        print(f"There's an error with the internet or with the  page, "
              f"please check it\n{ce}")
    except Exception as e:
        print(f"{e}")
    if response is not None:
        return response
    return None


def get_hot_section_links():
    hot_section_dict: Dict = {}
    page_12 = __validate_request(URL_PAGE_12)
    if page_12 is not None:
        soup_page = BeautifulSoup(page_12.text, 'lxml')
        hot_sections = soup_page.find('ul', attrs={'class': 'hot-sections'}).find_all('li')
        for section in hot_sections:
            hot_section_dict[section.a.text] = section.a['href']
        return hot_section_dict
    return None


def get_articles_links_hot_section(hot_section_link):
    links = {}
    page_section = __validate_request(hot_section_link)
    if page_section is not None:
        soup_section = BeautifulSoup(page_section.text, 'lxml')
        main_new = soup_section.find('div', attrs={'class': 'featured-article__container'})
        if main_new is not None:
            main_new = main_new.h2
            links[main_new.a.text.strip()] = main_new.a['href']
        others_news = soup_section.find('div', attrs={'class': 'sections-articles'}).find('ul', attrs={
            'class': 'article-list'}).find_all('li')
        for new in others_news:
            if new.text:
                links[new.h2.a.text.strip()] = new.h2.a['href']
        return links
    return None


def get_data_from_news(article_link):
    news = {}
    page_article = __validate_request(article_link)
    if page_article is not None:
        soup_article = BeautifulSoup(page_article.text, 'lxml')
        date = soup_article.find('div', attrs={'class': 'article-info'}).find('div', attrs={'class': 'time'}).span.get(
            'datetime')
        news['date'] = date
        volanta = soup_article.find('div', attrs={'class': 'article-titles'}).find('h2',
                                                                                   attrs={'class': 'article-prefix'})
        if volanta is not None:
            volanta = volanta.text.strip()
            news['volanta'] = volanta
        else:
            news['volanta'] = None
        title = soup_article.find('div', attrs={'class': 'article-titles'}).find('h1', attrs={
            'class': 'article-title'}).text.strip()
        news['title'] = title
        copete = soup_article.find('div', attrs={'class': 'article-titles'}).find('div',
                                                                                  attrs={'class': 'article-summary'})
        if copete is not None:
            news['copete'] = copete.text
        else:
            news['copete'] = None
        try:
            img_url = soup_article.find('div', attrs={'class': 'article-main-media-image'}).find_all('img')
        except Exception as e:
            img_url = None
        img_data = None
        if img_url:  # find_all may return an empty list
            img_url = img_url[-1]['src']
            img_data = __validate_request(img_url)
        if img_data is not None:
            news['img'] = img_data.content
        else:
            news['img'] = None
        body_raw = soup_article.find('div', attrs={'class': 'article-text'}).find_all('p')
        body_text = ''
        for paragraph in body_raw:
            body_text += paragraph.text.strip() + '\n'
        news['body'] = body_text
        return news
    return None


def data_to_dict():
    data_from_page_12: List[Dict] = []
    hot_section_links = get_hot_section_links()
    if hot_section_links is not None:
        for section_title, section_url in hot_section_links.items():
            print(f"\n\nGetting data from the section {section_title}\n\n")
            articles_section = get_articles_links_hot_section(section_url)
            if articles_section is not None:
                for article_title, article_url in articles_section.items():
                    print(f"Extracting data from the article {article_title}")
                    data_article = get_data_from_news(article_url)
                    if data_article is not None:
                        data_from_page_12.append(data_article)
                        print('The data was extracted')
    return data_from_page_12


def main():
    data = data_to_dict()
    dt = pd.DataFrame(data)
    print(dt.head(5))
    # dt.to_csv("data.csv")


if __name__ == '__main__':
    main()

My solution to the integration challenge

def scrapear_nota(url):
    #definimos el diccionario que guarda los datos
    datos = dict()
    
    try: # Validar que le estés haciendo request a una dirección válida
        nota = requests.get(url)
    except Exception as e:
        print('Error scrapeando la URL: ', url)
        return datos
        # antes era 'return None'; ahora devuelve un diccionario vacío
    sopa = BeautifulSoup(nota.text, 'lxml') # utilizamos este parser porque es más rápido
   
    # Segunda etapa. Guardar el resultado de la request en un diccionario
    
    #Extraer Volanta
    try:
        volanta = sopa.find('h2', attrs={'class':'article-prefix'})
        datos['Volanta'] = volanta.text
    except:
        datos['Volanta'] = None

    #Extraer el titulo
    titulo = sopa.find('h1', attrs = {'class':'article-title'})
    datos['Titulo'] = titulo.text
    
    #Extraer URL de la imagen
    try:
        img_list = sopa.find('div', attrs={'class':'article-main-media'}).find_all('img')
        if len(img_list) != 0: # el tag puede estar, pero estar vacío
            imagen = img_list[-1] # seleccionar la última imagen (mayor resolución)
            img_src = imagen.get('data-src')
            datos['URL Imagen'] = img_src
        else: 
            datos['URL Imagen'] = None
    except:
        datos['URL Imagen'] = None

    #Extraer texto de imagen
    try:
        texto_imagen= sopa.find('span', attrs ={'class':'article-main-media-text-image'})
        datos['Leyenda Imagen'] = texto_imagen.text
    except:
        datos['Leyenda Imagen'] = None


    #Extraer fecha
    fecha = sopa.find('div', attrs={'class':'time'})
    datos['Fecha'] = fecha.text

    #Extraer autor
    try:
        autor = sopa.find('div', attrs={'class':'article-author'})
        datos['Autor'] = autor.text
    except:
        datos['Autor'] = None

    #Extraer cuerpo
    cuerpo = sopa.find('div', attrs={'class':'article-text'})
    datos['Cuerpo'] = cuerpo.text
    

    return datos


# ahora sí, a todas las noticias les vamos a aplicar el scraper
data = []
for i,note in enumerate(noticias):
    print(f'Scrapeando nota {i} / {len(noticias)}')
    data.append( scrapear_nota(note) )


# luego de hacer el scraping de todas las noticias,
# descartamos los diccionarios vacíos (más robusto que data.pop con un índice fijo)
data = [d for d in data if d]

# convertimos en DF utilizando pandas
df = pd.DataFrame(data)
df
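Since this solution keeps only the image URL instead of the raw bytes, the files can be fetched in a separate pass. A minimal sketch, assuming the df built above has the 'URL Imagen' column with absolute URLs and that saving every file as .jpg is acceptable; descargar_imagenes is a hypothetical helper, not part of the original solution.

import os
import requests


def descargar_imagenes(df, carpeta='imagenes'):
    # Download every non-null image URL and save it as <carpeta>/nota_<row index>.jpg
    os.makedirs(carpeta, exist_ok=True)
    for idx, url in df['URL Imagen'].dropna().items():
        try:
            resp = requests.get(url, timeout=10)
            if resp.status_code == 200:
                with open(os.path.join(carpeta, f'nota_{idx}.jpg'), 'wb') as f:
                    f.write(resp.content)
        except Exception as e:
            print(f'Could not download {url}: {e}')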

Excellent, a great complement to the XPath course.

The first part of this course was great. For my part, I built this scraper for the El Comercio newspaper (Peru); it fetches the latest 100 published articles.


import requests
from bs4 import BeautifulSoup
import pandas as pd

def main(url):
    allnotes = scrap_section(url)
    data = obtener_data(allnotes)
    save_data(data)


def scrap_section(url):
    notes = []
    try:
        comercio = requests.get(url)
        if comercio.status_code == 200:
            s = BeautifulSoup(comercio.text, 'lxml')
            notas = s.find_all('div', attrs={
                'class': 'story-item'
            })
            for nota in notas:
                link = 'https://elcomercio.pe'+ nota.h2.a.get('href')
                notes.append(link)
            return notes
        else:
            print('No se pudo obtener la seccion', url)
    except:
        print('No se pudo obtener la seccion', url)


def obtener_data(allnotes):
    data = []
    for i, nota in enumerate(allnotes):
        print(f'Scrapeando nota {i + 1}/{len(allnotes)}')
        data.append(scrap_nota(nota))
    return data

def scrap_nota(url):
    try:
        nota = requests.get(url)
        if nota.status_code == 200:
            s_nota = BeautifulSoup(nota.text, 'html.parser')
            if '/videos/' in url:
                ret_dict = obtener_infovideo(s_nota)
                ret_dict['url'] = url
            else:
                ret_dict = obtener_info(s_nota)
                ret_dict['url'] = url
            return ret_dict
        else:
            print('No se pudo obtener la nota', url)
    except Exception as e:
        print(f'Error {e}')

def obtener_infovideo(s_nota):
    ret_dict = {}
    #Extraemos el titulo
    informacion = s_nota.find('div', attrs={'class': 'section-video__information'})
    if informacion:
        ret_dict['titulo'] = informacion.h1.get_text()
        ret_dict['subtitulo'] = informacion.p.get_text()
        print(ret_dict['titulo'])
    else:
        ret_dict['titulo'] = None
        ret_dict['subtitulo'] = None

    #Extraemos la imagen:
    imagen = s_nota.find('picture').img.get('src')
    try:
        img_req = requests.get(imagen)
        if img_req.status_code == 200:
            ret_dict['imagen'] = imagen
        else:
            ret_dict['imagen'] = None
    except:
        print('No se pudo obtener la imagen')
        ret_dict['imagen'] = None

    #Extraemos fecha de publicacion
    publicacion = s_nota.find('ul', attrs={'class':'section-video__list-text'}).get_text()
    print(publicacion)
    if publicacion:
        ret_dict['publicacion'] = publicacion
    else:
        ret_dict['publicacion'] = None

    return ret_dict



def obtener_info(s_nota):
    ret_dict = {}

    # Extraemos el titulo y subtitulo
    titulo = s_nota.find('div', attrs={'class': 'sht'})
    if titulo:
        ret_dict['titulo'] = titulo.h1.get_text()
        ret_dict['subtitulo'] = titulo.h2.get_text()
    else:
        ret_dict['titulo'] = None
        ret_dict['subtitulo'] = None

    #Extraer imagen:
    imagen = s_nota.find('picture').img.get('src')
    try:
        img_req = requests.get(imagen)
        if img_req.status_code == 200:
            ret_dict['imagen'] = imagen
        else:
            ret_dict['imagen'] = None
    except:
        print('No se pudo obtener la imagen')
        ret_dict['imagen'] = None

    #cuerpo

    cuerpo = s_nota.find('div', attrs={'class': 'story-contents__content'}).section.find_all('p', attrs={'itemprop': 'description'})
    ret_dict['cuerpo'] = ''
    if cuerpo:
        for p in cuerpo:
            ret_dict['cuerpo'] = ret_dict['cuerpo'] + p.get_text()
    else:
        ret_dict['cuerpo'] = None


    # fecha de publicacion y de actualizacion
    publicacion = s_nota.find('div',attrs={'class': 'story-contents__author-date'}).find_all('time')
    if publicacion:
        ret_dict['publicacion'] = publicacion[0].get('datetime')
        ret_dict['actualizacion'] = publicacion[1].get_text()
    else:
        ret_dict['publicacion'] = None
        ret_dict['actualizacion'] = None

    return ret_dict


def save_data(data):
    df = pd.DataFrame(data)
    df.to_csv('ultimasNoticiasComercio.csv')
    return df

if __name__ == "__main__":
    url = 'https://elcomercio.pe/ultimas-noticias/'
    main(url)
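Note that scrap_nota above returns None for notes it cannot fetch, so pd.DataFrame(data) inside save_data can hit the same "'NoneType' object has no attribute 'keys'" error discussed earlier. A minimal guard, plus a quick check of the resulting CSV (column names are the ones set in obtener_info); this is a sketch of one possible adjustment, not part of the original solution.

def save_data(data):
    # Drop entries for notes that scrap_nota could not fetch
    data = [d for d in data if d is not None]
    df = pd.DataFrame(data)
    df.to_csv('ultimasNoticiasComercio.csv')
    return df


# After running the script, the CSV can be inspected with:
# df = pd.read_csv('ultimasNoticiasComercio.csv', index_col=0)
# print(df[['titulo', 'publicacion']].head())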

Good class.