
Mario Barbero Santillán
Pregunta: Me salta un error a la hora de obtener los datos del CSV con la librería pandas. No sé cómo resolverlo.

Mario Barbero Santillán
Me da este error al ejecutarlo en la consola de Windows. Pone que es por la codificación. ¿Alguien sabe por qué?
INFO:__main__:Starting cleaning process
INFO:__main__:Reading file eluniversal_2019_01_26_articles.csv
Traceback (most recent call last):
  File "pandas\_libs\parsers.pyx", line 1134, in pandas._libs.parsers.TextReader._convert_tokens
  File "pandas\_libs\parsers.pyx", line 1240, in pandas._libs.parsers.TextReader._convert_with_dtype
  File "pandas\_libs\parsers.pyx", line 1256, in pandas._libs.parsers.TextReader._string_convert
  File "pandas\_libs\parsers.pyx", line 1494, in pandas._libs.parsers._string_box_utf8
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 8: invalid start byte

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "newspaper_receipe.py", line 57, in <module>
    df = main(args.filename)
  File "newspaper_receipe.py", line 13, in main
    df = _red_data(filename)
  File "newspaper_receipe.py", line 23, in _red_data
    return pd.read_csv(filename)
  File "C:\Users\MARIO-PORTATIL\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 678, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\MARIO-PORTATIL\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 446, in _read
    data = parser.read(nrows)
  File "C:\Users\MARIO-PORTATIL\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1036, in read
    ret = self._engine.read(nrows)
  File "C:\Users\MARIO-PORTATIL\Anaconda3\lib\site-packages\pandas\io\parsers.py", line 1848, in read
    data = self._reader.read(nrows)
  File "pandas\_libs\parsers.pyx", line 876, in pandas._libs.parsers.TextReader.read
  File "pandas\_libs\parsers.pyx", line 891, in pandas._libs.parsers.TextReader._read_low_memory
  File "pandas\_libs\parsers.pyx", line 968, in pandas._libs.parsers.TextReader._read_rows
  File "pandas\_libs\parsers.pyx", line 1094, in pandas._libs.parsers.TextReader._convert_column_data
  File "pandas\_libs\parsers.pyx", line 1141, in pandas._libs.parsers.TextReader._convert_tokens
  File "pandas\_libs\parsers.pyx", line 1240, in pandas._libs.parsers.TextReader._convert_with_dtype
  File "pandas\_libs\parsers.pyx", line 1256, in pandas._libs.parsers.TextReader._string_convert
  File "pandas\_libs\parsers.pyx", line 1494, in pandas._libs.parsers._string_box_utf8
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa0 in position 8: invalid start byte
Y este es mi código del archivo “newspaper_receipe.py”
import argparse
import logging
import os
from urllib.parse import urlparse

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Encodings tried in order when reading the CSV. UTF-8 is the expected
# encoding; cp1252 covers files written by Windows tools (where byte 0xa0 is
# a non-breaking space); latin-1 maps every byte to a character, so it is the
# last resort and cannot raise UnicodeDecodeError.
_CSV_ENCODINGS = ('utf-8', 'cp1252', 'latin-1')


def main(filename):
    """Run the cleaning steps on the CSV at *filename*.

    Reads the file, adds a ``newspaper_uid`` column derived from the file
    name and a ``host`` column derived from the ``url`` column.

    Args:
        filename: Path to the CSV file with the scraped articles.

    Returns:
        The resulting pandas DataFrame.
    """
    logger.info('Starting cleaning process')

    df = _red_data(filename)
    newspaper_uid = _extract_newspaper_uid(filename)
    df = _add_newspaper_uid_column(df, newspaper_uid)
    df = _extract_host(df)

    return df


def _red_data(filename):
    """Read the CSV at *filename* into a DataFrame.

    The file is decoded as UTF-8 first. If that fails with a
    ``UnicodeDecodeError`` the next encoding in ``_CSV_ENCODINGS`` is tried,
    with a warning logged for each fallback.
    """
    logger.info('Reading file %s', filename)

    for encoding in _CSV_ENCODINGS[:-1]:
        try:
            return pd.read_csv(filename, encoding=encoding)
        except UnicodeDecodeError:
            logger.warning(
                'Could not decode %s as %s, trying next encoding',
                filename,
                encoding,
            )

    # Last candidate is attempted outside the try so that any remaining
    # error reaches the caller unchanged.
    return pd.read_csv(filename, encoding=_CSV_ENCODINGS[-1])


def _extract_newspaper_uid(filename):
    """Return the part of the file's base name before the first underscore."""
    logger.info('Extracting newspaper uid')
    # Strip directories first: the CLI accepts a path, and a directory
    # component must not end up in (or truncate) the uid.
    newspaper_uid = os.path.basename(filename).split('_')[0]

    logger.info('Newspaper uid detected %s', newspaper_uid)
    return newspaper_uid


def _add_newspaper_uid_column(df, newspaper_uid):
    """Set the ``newspaper_uid`` column of *df* to *newspaper_uid* on every row."""
    logger.info('Filling newspaper_uid column with %s', newspaper_uid)
    df['newspaper_uid'] = newspaper_uid

    return df


def _extract_host(df):
    """Add a ``host`` column holding the network location of each ``url``."""
    logger.info('Extracting host from urls')
    df['host'] = df['url'].apply(lambda url: urlparse(url).netloc)

    return df


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('filename',
                        help='The path to the dirty data',
                        type=str)

    args = parser.parse_args()

    df = main(args.filename)
    print(df)