问题描述
我尝试读取一个从 Kaggle 下载的数据集,其大小超过 5 GB。
train_df = pd.read_csv(f"{PATH}/train.csv",low_memory=False,nrows=10**5,dtype={'row_id': 'int64','timestamp': 'int64','user_id': 'int32','content_id': 'int16','content_type_id': 'int8','task_container_id': 'int16','user_answer': 'int8','answered_correctly': 'int8','prior_question_elapsed_time': 'float32','prior_question_had_explanation': 'boolean'},parse_dates=["timestamp"],date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
我运行了上面的代码:先为各列指定 dtype,再把所有时间戳转换为日期时间,但出现了如下错误:
TypeError Traceback (most recent call last)
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3358 result = tools.to_datetime(
-> 3359 date_parser(*date_cols),errors="ignore",cache=cache_dates
3360 )
<ipython-input-82-2cac40069ffd> in <lambda>(x)
8 parse_dates=["timestamp"],----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
TypeError: only size-1 arrays can be converted to Python scalars
During handling of the above exception,another exception occurred:
OverflowError Traceback (most recent call last)
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3367 parsing.try_parse_dates(
-> 3368 parsing.concat_date_cols(date_cols),3369 parser=date_parser,pandas\_libs\tslibs\parsing.pyx in pandas._libs.tslibs.parsing.concat_date_cols()
pandas\_libs\tslibs\parsing.pyx in pandas._libs.tslibs.parsing.convert_to_unicode()
OverflowError: Python int too large to convert to C long
During handling of the above exception,another exception occurred:
OSError Traceback (most recent call last)
<ipython-input-82-2cac40069ffd> in <module>
7 'prior_question_had_explanation': 'boolean'},8 parse_dates=["timestamp"],----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
11 lectures_df = pd.read_csv(f"{PATH}/lectures.csv")
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read_csv(filepath_or_buffer,sep,delimiter,header,names,index_col,usecols,squeeze,prefix,mangle_dupe_cols,dtype,engine,converters,true_values,false_values,skipinitialspace,skiprows,skipfooter,nrows,na_values,keep_default_na,na_filter,verbose,skip_blank_lines,parse_dates,infer_datetime_format,keep_date_col,date_parser,dayfirst,cache_dates,iterator,chunksize,compression,thousands,decimal,lineterminator,quotechar,quoting,doublequote,escapechar,comment,encoding,dialect,error_bad_lines,warn_bad_lines,delim_whitespace,low_memory,memory_map,float_precision)
684 )
685
--> 686 return _read(filepath_or_buffer,kwds)
687
688
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer,kwds)
456
457 try:
--> 458 data = parser.read(nrows)
459 finally:
460 parser.close()
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read(self,nrows)
1194 def read(self,nrows=None):
1195 nrows = _validate_integer("nrows",nrows)
-> 1196 ret = self._engine.read(nrows)
1197
1198 # May alter columns / col_dict
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in read(self,nrows)
2228 data = {k: v for k,(i,v) in zip(names,data)}
2229
-> 2230 names,data = self._do_date_conversions(names,data)
2231 index,names = self._make_index(data,alldata,names)
2232
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _do_date_conversions(self,data)
1968 self.index_names,1969 names,-> 1970 keep_date_col=self.keep_date_col,1971 )
1972
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in _process_date_conversion(data_dict,converter,parse_spec,index_names,columns,keep_date_col)
3411 if _isindex(colspec):
3412 continue
-> 3413 data_dict[colspec] = converter(data_dict[colspec])
3414 else:
3415 new_name,col,old_names = _try_convert_dates(
c:\users\public\pycharmprojects\codility_test\venv\lib\site-packages\pandas\io\parsers.py in converter(*date_cols)
3373 )
3374 except Exception:
-> 3375 return generic_parser(date_parser,*date_cols)
3376
3377 return converter
c:\users\public(parse_func,*cols)
36 for i in range(N):
37 args = [c[i] for c in cols]
---> 38 results[i] = parse_func(*args)
39
40 return results
<ipython-input-82-2cac40069ffd> in <lambda>(x)
7 'prior_question_had_explanation': 'boolean'},----> 9 date_parser=lambda x: datetime.datetime.fromtimestamp(int(x)))
10 questions_df = pd.read_csv(f"{PATH}/questions.csv")
11 lectures_df = pd.read_csv(f"{PATH}/lectures.csv")
OSError: [Errno 22] Invalid argument
.shape
train_df.shape
我的理解是,出现该错误是因为需要从时间戳转换为日期时间的行太多了。我记得以前见过处理这种情况的方法,但现在想不起来了。有人可以帮忙吗?谢谢。
解决方法
暂未找到可以解决该程序问题的有效方法,小编正在努力寻找和整理中!
如果你已经找到好的解决方法,欢迎将解决方案连同本页链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)