问题描述
模块
import pandas as pad
import io
字符串数据
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'
转换为 pandas DataFrame
# Wrap the raw text in a file-like buffer: read_csv interprets a plain
# str argument as a file path / URL, not as the CSV content itself.
test_raw = io.StringIO(test)
# Parse the tab-separated text from the buffer (not from the raw string).
test_df = pad.read_csv(test_raw, sep='\t')
# Bare expression: displays the frame when run in a notebook / REPL.
test_df
Unnamed: 0 Start .1 Stop
0 NaN 12/12/20 1400 NaN NaN NaN
1 rugby NaN NaN NaN NaN
2 NaN 12/16/20 1359 NaN NaN NaN
3 NaN 12/12/20 1300 NaN NaN NaN
4 soccer NaN NaN NaN NaN
5 NaN 12/19/20 0859 NaN NaN NaN
6 NaN 12/12/20 1300 NaN NaN NaN
7 basketball NaN NaN NaN NaN
8 NaN 12/19/20 0659 NaN NaN NaN
我如何把它变成:
Start Sport Stop
0 12/12/20 1400 rugby 12/16/20 1359
1 12/12/20 1300 soccer 12/19/20 0859
2 12/12/20 1300 basketball 12/19/20 0659
提前谢谢!
解决方法
import re
import pandas as pd
import numpy as np
# Reshape a loosely tab/newline-formatted schedule into a tidy
# Start / Sport / Stop DataFrame.
test = '\tStart\t \t \tStop\n\t12/12/20 1400\t\t\nrugby \n\t12/16/20 1359\n\t12/12/20 1300\t\t\nsoccer \n\t12/19/20 0859\n\t12/12/20 1300\t\t\nbasketball \n\t12/19/20 0659\n\n\n\n'

# step1: split the raw text into lines
alist = test.split('\n')
# alist ->
# ['\tStart\t \t \tStop',
#  '\t12/12/20 1400\t\t', 'rugby ', '\t12/16/20 1359',
#  '\t12/12/20 1300\t\t', 'soccer ', '\t12/19/20 0859',
#  '\t12/12/20 1300\t\t', 'basketball ', '\t12/19/20 0659',
#  '', '', '', '']

# step2: after the header line (index 0) every record spans three lines
#   1. Start lives at index 1, 4, ..., 1 + 3n
#   2. Sport lives at index 2, 5, ..., 2 + 3n
#   3. Stop  lives at index 3, 6, ..., 3 + 3n
Start_col1 = alist[1::3]  # -> ['\t12/12/20 1400\t\t', '\t12/12/20 1300\t\t', '\t12/12/20 1300\t\t', '', '']
Sport_col2 = alist[2::3]  # -> ['rugby ', 'soccer ', 'basketball ', '']
Stop_col3 = alist[3::3]   # -> ['\t12/16/20 1359', '\t12/19/20 0859', '\t12/19/20 0659', '']

# step3: zip the three slices so each tuple is one record
# (zip stops at the shortest slice, so the extra '' in Start_col1 is dropped)
blist = list(zip(Start_col1, Sport_col2, Stop_col3))
# blist ->
# [('\t12/12/20 1400\t\t', 'rugby ', '\t12/16/20 1359'),
#  ('\t12/12/20 1300\t\t', 'soccer ', '\t12/19/20 0859'),
#  ('\t12/12/20 1300\t\t', 'basketball ', '\t12/19/20 0659'),
#  ('', '', '')]

# step4: convert to a DataFrame (columns are 0, 1, 2 for now)
dfn = pd.DataFrame(blist)
print(dfn)  # four rows; values still carry tabs/spaces, last row is empty

# step5: strip surrounding whitespace from every cell.
# Column-wise Series.str.strip is used instead of DataFrame.applymap,
# which is deprecated since pandas 2.1.
dfn = dfn.apply(lambda col: col.str.strip())
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659
# 3

# step6: drop the rows whose Sport cell (column 1) is empty
cond = dfn[1] == ''
dfn = dfn[~cond]
print(dfn)
#                0           1              2
# 0  12/12/20 1400       rugby  12/16/20 1359
# 1  12/12/20 1300      soccer  12/19/20 0859
# 2  12/12/20 1300  basketball  12/19/20 0659

# step7: give the columns their final names
dfn.columns = ['Start', 'Sport', 'Stop']