问题描述
我正在尝试创建一个用于分层抽样的函数,该函数将使用使用fakerr模块创建的数据帧以及分层,样本大小和随机种子纳入其中。对于样本大小,我希望每个层次中的样本数根据用户输入而有所不同。这是我用于创建数据的代码:
import pandas as pd
import numpy as np
import random as rn#generating random numbers
from faker import Faker
fake = Faker()
frame_fake = pd.DataFrame( [{"region":
fake.random_number(1,fix_len=True),"district": fake.random_number(2,"enum_area": fake.random_number(5,"hhs": fake.random_number(3),"pop": fake.random_number(4),"area": fake.random_number(1)} for x in range(100)])
# check for and remove duplicates from enum area (should be unique)
# before any further analysis
mask= frame_fake.duplicated('enum_area',keep='last')
duplicates = frame_fake[mask]
# print(duplicates)
# drop all except last
frame_fake = frame_fake.drop_duplicates('enum_area',keep='last').sort_values(by='enum_area',ascending=True)
# reset index to have them sequentially after sorting by enum_area and
# drop the old index column
frame_fake = frame_fake.reset_index().drop('index',axis=1)
frame_fake
这是采样代码:
def stratified_custom(data,strata,sample_size,seed=None):
# for this part,we sample 5 enum areas in each strata/region
# we groupby strata and use the transform method with 'count' parameter
# to get strata sizes
data['strat_size'] = data.groupby(strata)[strata].transform('count')
# map input sample size to each strata
data['strat_sample_size'] = data[strata].map(sample_size)
# grouby strata,get sample size per stratum,cast to int and reset
# index.
smp_size = data.groupby(strata)
['strat_sample_size'].unique().astype(int).reset_index()
# groupby strata and select sample per stratum based on the sample size
# for that strata
sample = (data.groupby(strata,group_keys=False)
.apply(lambda x: x.sample(smp_size,random_state=seed)))
# probability of inclusion
sample['inclusion_prob'] =
sample['strat_sample_size']/sample['strat_size']
return sample
s_size={1:7,2:5,3:5,4:5,5:5,6:5,7:5,8:5,9:8} #pass in strata and sample
# size as dict. (key,values)
(stratified_custom(data=frame_fake,strata='region',sample_size=s_size,seed=99).sort_values(by=['region','enum_area'],ascending=True))
但是我收到此错误:
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty,a.bool(),a.item(),a.any() or a.all().
我不知道这个错误在说什么。任何帮助表示赞赏。
解决方法
经过大量研究,我偶然发现了这篇文章https://stackoverflow.com/a/58794577/14198137,并在我的代码中实现了这一点,不仅可以根据不同的样本量进行抽样,还可以使用相同的功能对固定样本进行抽样。这是我的数据代码:
import pandas as pd
import numpy as np
import random as rn
from faker import Faker
Faker.seed(99)
fake = Faker()
frame_fake = pd.DataFrame( [{"region":
fake.random_number(1,fix_len=True),"district":
fake.random_number(2,"enum_area":
fake.random_number(5,"hhs":
fake.random_number(3),"pop":
fake.random_number(4),"area":
rn.randint(1,2)} for x in range(100)])
frame_fake = frame_fake.drop_duplicates('enum_area',keep='last').sort_values(by='enum_area',ascending=True)
frame_fake = frame_fake.reset_index().drop('index',axis=1)
这是分层抽样的更新代码,现在可以使用。
def stratified_custom(data,strata,sample_size,seed=None):
data = data.copy()
data['strat_size'] = data.groupby(strata)[strata].transform('count')
try:
data['strat_sample_size'] = data[strata].map(sample_size)
smp_size = data.set_index(strata)['strat_sample_size'].to_dict()
strat2_sample = (data.groupby(strata,group_keys=False).apply(lambda x: x.sample(smp_size[x.name],random_state=seed)))
strat2_sample['inclusion_prob'] = strat2_sample['strat_sample_size']/strat2_sample['strat_size']
return strat2_sample
except:
data['strat_sample_size'] = sample_size
strat2_sample = (data.groupby(strata,group_keys=False).apply(lambda x: x.sample(sample_size,random_state=seed)))
strat2_sample['inclusion_prob'] = strat2_sample['strat_sample_size']/strat2_sample['strat_size']
return strat2_sample
s_size={1:3,2:9,3:5,4:5,5:5,6:5,7:5,8:5,9:8}
variablesize = (stratified_custom(data=frame_fake,strata='region',sample_size=s_size,seed=99).sort_values(by=['region','enum_area'],ascending=True)).head()
variablesize
fixedsize = (stratified_custom(data=frame_fake,sample_size=3,ascending=True)).head()
fixedsize
可变样本量的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
0 2 65 10566 ... 10 9 0.9
15 2 22 25560 ... 10 9 0.9
固定样本量的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
38 2 74 48408 ... 10 3 0.3
43 2 15 56365 ... 10 3 0.3
但是我想知道是否有更好的方法来实现这一目标?