2015-04-08-数据聚合与分组运算(3)-透视表和交叉表+2012联邦选举委员会数据库

---------------------------------------------------------------------------------------
-----(四)、透视表和交叉表
from pandas import DataFrame,Series
import numpy as np
import pandas as pd
cd L:\czz
tips=pd.read_csv(r'ch08\tips.csv')
tips['tip_pct']=tips['tip']/tips['total_bill']


根据sex和smoker计算分组平均数
tips.pivot_table(rows=['sex','smoker'])


tips.pivot_table(['tip_pct','size'],rows=['sex','day'],cols='smoker')


添加小计
tips.pivot_table(['tip_pct',cols='smoker',margins=True)


使用其他的聚合函数
tips.pivot_table('tip_pct','smoker'],cols='day',aggfunc=len,margins=True)


填充空值
tips.pivot_table('size',rows=['time','sex',aggfunc='sum',fill_value=0)


tips.pivot_table('size',aggfunc='sum')


-----交叉表:crosstab
data=DataFrame({'Sample':[1,2,3,4,5,6,7,8,9,10],
'Gender':['FeMale','Male','FeMale','FeMale'],
'Handedness':['Right-handed','Left-handed','Right-handed','Right-handed']
})


pd.crosstab(data.Gender,data.Handedness,margins=True)


pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)
---------------------------------------------------------------------------------------
-----(五)、2012联邦选举委员会数据库
**** fec=pd.read_csv(r'ch09\P00000001-ALL.csv')
fec.ix[123456]


获取全部的候选人名单
unique_cands=fec.cand_nm.unique()
unique_cands


利用字典说明党派关系
**** parties={
'Bachmann,Michelle':'Republican',
'Cain,Herman':'Republican',
'Gingrich,Newt':'Republican',
'Huntsman,Jon':'Republican',
'Johnson,Gary Earl':'Republican',
'McCotter,Thaddeus G':'Republican',
'Obama,Barack':'Democrat',
'Paul,Ron':'Republican',
'Pawlenty,Timothy':'Republican',
'Perry,Rick':'Republican',
"Roemer,Charles E. 'Buddy' III":'Republican',
'Romney,Mitt':'Republican',
'Santorum,
}


fec.cand_nm[123456:123461]
fec.cand_nm[123456:123461].map(parties)
将其添加一个新列
**** fec['party']=fec.cand_nm.map(parties)
**** fec['party'].value_counts()


该数据既包括赞助也包括退款(负的出资额)
(fec.contb_receipt_amt>0).value_counts()


限定该数据集只能有正的出资额
**** fec=fec[fec.contb_receipt_amt>0]


只包含Obama,Barack和Romney,Mitt的信息
**** fec_mrbo=fec[fec.cand_nm.isin(['Obama,Barack','Romney,Mitt'])]


-----1.根据职业和雇主统计赞助信息
fec.contbr_occupation.value_counts()[:10]
**** occ_mapping={
'informatION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'informatION REQUESTED':'NOT PROVIDED',
'informatION REQUESTED (BEST EFFORTS)':'NOT PROVIDED',
'C.E.O.':'CEO'
}
#如果没有提供相关映射,则返回x
f=lambda x:occ_mapping.get(x,x)
fec.contbr_occupation=fec.contbr_occupation.map(f)


对雇主信息也进行同样的处理
**** emp_mapping={
'informatION REQUESTED PER BEST EFFORTS':'NOT PROVIDED',
'SELF':'SELF-EMPLOYED',
'SELF EMPLOYED':'SELF-EMPLOYED'
}
f=lambda x:emp_mapping.get(x,x)
fec.contbr_employer=fec.contbr_employer.map(f)


通过pivot_table根据党派和职业对数据进行聚合,然后过滤掉总出资额不足200万美元的数据
**** by_occupation=fec.pivot_table('contb_receipt_amt',
rows='contbr_occupation',
cols='party',
aggfunc='sum'
)
over_2mm=by_occupation[by_occupation.sum(1)>2000000]
over_2mm


做成柱状图
**** over_2mm.plot(kind='barh')


-----2.对Obama和Romney总出资额最高的企业和职业
****
def get_top_amounts(group,key,n=5):
totals=group.groupby(key)['contb_receipt_amt'].sum()
return totals.order(ascending=False)[:n]


grouped=fec_mrbo.groupby('cand_nm')
grouped.apply(get_top_amounts,'contbr_occupation',n=7)


grouped.apply(get_top_amounts,'contbr_employer',n=10)


-----3.对出资额进行分组
bins=np.array([0,1,10,100,1000,10000,100000,1000000,10000000])
labels=pd.cut(fec_mrbo.contb_receipt_amt,bins)


grouped=fec_mrbo.groupby(['cand_nm',labels])
grouped.size().unstack(0)


bucket_sums=grouped.contb_receipt_amt.sum().unstack(0)
bucket_sums


两位候选人收到的各种捐赠额度的总额比例
normed_sums=bucket_sums.div(bucket_sums.sum(axis=1),axis=0)
normed_sums


normed_sums[:-2].plot(kind='barh',stacked=True)


-----4.根据州统计赞助信息
grouped=fec_mrbo.groupby(['cand_nm','contbr_st'])
totals=grouped.contb_receipt_amt.sum().unstack(0).fillna(0)
totals=totals[totals.sum(1)>100000]
totals[:10]


总赞助额比例
percent=totals.div(totals.sum(1),axis=0)
percent[:10]


在地图上画出来?

相关文章

迭代器模式(Iterator)迭代器模式(Iterator)[Cursor]意图...
高性能IO模型浅析服务器端编程经常需要构造高性能的IO模型,...
策略模式(Strategy)策略模式(Strategy)[Policy]意图:定...
访问者模式(Visitor)访问者模式(Visitor)意图:表示一个...
命令模式(Command)命令模式(Command)[Action/Transactio...
生成器模式(Builder)生成器模式(Builder)意图:将一个对...