¿Y cómo extraer datos si en la noticia tengo letras en negrilla, subrayadas o si en la noticia hay links? Ese comando solo funciona para ...

Gabriel Salvador

Pregunta

student•hace 4 años

¿Y cómo extraer datos si en la noticia tengo letras en negrilla, subrayadas o si en la noticia hay links? Ese comando solo funciona cuando el texto es plano, pero, por ejemplo, en elcomercio.com ponen palabras en negrita, lo que hace que no funcione ese comando de XPath. He buscado y no encuentro cómo hacerlo.

Gabriel Salvador

student•hace 4 años

Ya logré hacerlo. En vez de terminar el XPath del body con 'text()', lo elimino. Luego, cuando recorro los párrafos, uso el método text_content():

import requests
import lxml.html as html # para aplicar Xpath a HTML
import os
import datetime

# Front page that is scanned for links to individual articles.
HOME_URL = 'https://www.elcomercio.com/'


# XPath expressions applied to the parsed pages.
XPATH_LINK_TO_ARTICLE = '//h3[@class="article-highlighted__title"]/a/@href'  # href of each highlighted article
XPATH_TITLE = '//h1[@class="entry__title"]/text()'
# Selects the <p> elements themselves (no trailing text()) so that
# text_content() can later read text nested in bold, underline or link tags.
XPATH_BODY = '//div[@class="entry__content"]/p'

def parse_notice(link, today, timeout=10):
    """Download one article and print its title and paragraph text.

    Each paragraph is read with ``text_content()`` so that text nested in
    inline markup (bold, underline, links) is included in the output.

    Args:
        link: URL of the article to fetch.
        today: Date string supplied by the caller. Currently unused because
            saving articles to disk is disabled.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. The title, the paragraphs and any error are reported with
        ``print``.
    """
    try:
        # A timeout keeps one stalled server from hanging the whole run.
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported instead of aborting the crawl.
        print(error)
        return

    try:
        if response.status_code != 200:
            raise ValueError(f'Error: {response.status_code}')

        # UnicodeDecodeError is a ValueError subclass, so a badly encoded
        # page is reported by the handler below as well.
        notice = response.content.decode('utf-8')  # HTML of the article
        parsed = html.fromstring(notice)

        titles = parsed.xpath(XPATH_TITLE)
        if not titles:
            # The page does not match the expected article layout.
            print('ha habido un error')
            return
        print(titles[0])

        for paragraph in parsed.xpath(XPATH_BODY):
            print(paragraph.text_content())
    except ValueError as ve:
        print(ve)
        


def parse_home(timeout=10):
    """Fetch the home page and process every highlighted article link.

    Each link matched by ``XPATH_LINK_TO_ARTICLE`` is printed and passed to
    ``parse_notice`` together with today's date formatted as ``dd-mm-YYYY``.

    Args:
        timeout: Seconds to wait for the home page before giving up.

    Returns:
        None. Links and errors are reported with ``print``.
    """
    try:
        # A timeout keeps a stalled server from hanging the script forever.
        response = requests.get(HOME_URL, timeout=timeout)
    except requests.RequestException as error:
        # Network failures are reported instead of ending in a traceback.
        print(error)
        return

    try:
        if response.status_code != 200:  # anything but 200 is treated as a failure
            raise ValueError(f"Error: {response.status_code}")

        home = response.content.decode('utf-8')
        parsed = html.fromstring(home)
        links_to_notices = parsed.xpath(XPATH_LINK_TO_ARTICLE)

        today = datetime.date.today().strftime('%d-%m-%Y')
        # NOTE: creating the per-day directory is disabled, so articles are
        # processed only while no directory named after today's date exists.
        if os.path.isdir(today):
            return

        for link in links_to_notices:
            print(link)
            parse_notice(link, today)
            print('*' * 10)
    except ValueError as ve:
        print(ve)

def main():
    """Entry point of the script: scrape the home page and its articles."""
    parse_home()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

¿Y cómo extraer datos si en la noticia tengo letras en negrilla, subrayadas o si en la noticia hay links? Ese comando solo funciona para ...

Curso de Web Scraping con Python y Xpath

Curso de Web Scraping con Python y Xpath