Gabriel Salvador
Pregunta: ¿Y cómo extraer datos si en la noticia tengo letras en negrilla, subrayadas, o si en la noticia hay links? Ese comando solo funciona cuando el texto es plano, pero, por ejemplo, en elcomercio.com ponen palabras en negrita, lo que hace que no funcione ese comando de XPath. He buscado y no encuentro cómo hacerlo.
Gabriel Salvador
Y logré hacerlo. En vez de terminar el XPath del body con 'text()', lo elimino. Luego, cuando recorro los párrafos, uso el método text_content() en cada uno.
"""Scrape the highlighted articles of elcomercio.com and print their text.

The body XPath selects whole ``<p>`` elements (not ``text()`` nodes) and the
text is read with ``text_content()``, so paragraphs that contain bold text,
underlined text or links are still extracted in full.
"""
import datetime
import os  # not used yet; needed once saving to disk is re-enabled (see TODOs)

import lxml.html as html  # used to run XPath queries against HTML
import requests

HOME_URL = 'https://www.elcomercio.com/'

# Link of each highlighted article on the home page.
XPATH_LINK_TO_ARTICLE = '//h3[@class="article-highlighted__title"]/a/@href'
XPATH_TITLE = '//h1[@class="entry__title"]/text()'
# Deliberately selects the <p> elements themselves instead of ending in
# text(): text() only yields the direct text nodes and would miss words
# wrapped in inline tags such as <b>, <u> or <a>.
XPATH_BODY = '//div[@class="entry__content"]/p'

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 10


def _fetch_tree(url):
    """Download *url* and return its HTML parsed into an lxml element tree.

    Raises:
        ValueError: if the response status code is not 200, or if the body
            is not valid UTF-8 (``UnicodeDecodeError`` is a ``ValueError``).
        requests.RequestException: on connection problems or timeouts.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        raise ValueError(f'Error: {response.status_code}')
    return html.fromstring(response.content.decode('utf-8'))


def parse_notice(link, today):
    """Print the title and every body paragraph of the article at *link*.

    Args:
        link: URL of the article page.
        today: Date string for the current run. Currently unused; kept so
            the signature stays stable for the disabled saving step.

    Download and HTTP-status errors are printed instead of raised, so one
    bad article does not stop the rest of the run.
    """
    try:
        parsed = _fetch_tree(link)
    except (ValueError, requests.RequestException) as err:
        print(err)
        return

    titles = parsed.xpath(XPATH_TITLE)
    if not titles:
        # The page does not have the expected article layout.
        print('ha habido un error')
        return

    print(titles[0])
    for paragraph in parsed.xpath(XPATH_BODY):
        # text_content() concatenates the text of the element and all of
        # its descendants, so inline formatting does not break extraction.
        print(paragraph.text_content())

    # TODO: saving each article to f'{today}/{title}.txt' is not implemented.


def parse_home():
    """Fetch the home page and process every highlighted article link.

    Each link is printed, handed to ``parse_notice`` and followed by a
    separator line. Download and HTTP-status errors for the home page are
    printed instead of raised.
    """
    try:
        parsed = _fetch_tree(HOME_URL)
    except (ValueError, requests.RequestException) as err:
        print(err)
        return

    links_to_notices = parsed.xpath(XPATH_LINK_TO_ARTICLE)
    today = datetime.date.today().strftime('%d-%m-%Y')

    # TODO: when saving to disk is re-enabled, create the output folder here
    # with os.makedirs(today, exist_ok=True). The articles are processed
    # whether or not that folder already exists.
    for link in links_to_notices:
        print(link)
        parse_notice(link, today)
        print('*' * 10)


def main():
    """Script entry point."""
    parse_home()


if __name__ == '__main__':
    main()