不同大小的分层抽样

问题描述

我正在尝试编写一个用于分层抽样的函数，该函数接收一个用 faker 模块生成的数据帧，以及分层变量、样本大小和随机种子作为参数。对于样本大小，我希望每个层中抽取的样本数可以根据用户输入而各不相同。这是我用于创建数据的代码：

import pandas as pd
import numpy as np
import random as rn#generating random numbers
from faker import Faker

fake = Faker()

# Build 100 synthetic records. Each field is a random number whose digit
# count is given by the first argument; fix_len=True asks Faker for exactly
# that many digits, so "region" is a single non-zero digit.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2),
            "enum_area": fake.random_number(5),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": fake.random_number(1),
        }
        for _ in range(100)
    ]
)

# enum_area should be unique: find duplicates before any further analysis.
mask = frame_fake.duplicated('enum_area', keep='last')
duplicates = frame_fake[mask]
# print(duplicates)

# Drop all duplicates except the last occurrence, then sort by enum_area.
frame_fake = frame_fake.drop_duplicates('enum_area', keep='last').sort_values(
    by='enum_area', ascending=True
)
# Renumber the rows sequentially after sorting and discard the old index.
frame_fake = frame_fake.reset_index(drop=True)
frame_fake

这是采样代码：

def stratified_custom(data, strata, sample_size, seed=None):
    """Attempt a stratified sample with a different sample size per stratum.

    This is the failing version from the question, reproduced with valid
    indentation. It adds the helper columns to ``data`` in place and then
    raises ``ValueError`` at the sampling step (see the NOTE below).

    Args:
        data: DataFrame to sample from; it is modified in place.
        strata: Name of the column that defines the strata.
        sample_size: Mapping of stratum value -> number of rows to draw.
        seed: Value passed to ``DataFrame.sample`` as ``random_state``.
    """
    # Group by strata and use transform('count') to get each stratum's size.
    data['strat_size'] = data.groupby(strata)[strata].transform('count')
    # Map the requested sample size onto every row of its stratum.
    data['strat_sample_size'] = data[strata].map(sample_size)
    # Group by strata, take the sample size per stratum, cast to int and
    # reset the index. The result is a two-column DataFrame.
    smp_size = (
        data.groupby(strata)['strat_sample_size']
        .unique()
        .astype(int)
        .reset_index()
    )
    # Group by strata and draw the sample for each stratum.
    # NOTE: smp_size is a whole DataFrame, not an integer. Passing it as `n`
    # to DataFrame.sample is what triggers
    # "ValueError: The truth value of a DataFrame is ambiguous".
    sample = (data.groupby(strata, group_keys=False)
              .apply(lambda x: x.sample(smp_size, random_state=seed)))
    # Probability of inclusion for each sampled row.
    sample['inclusion_prob'] = (
        sample['strat_sample_size'] / sample['strat_size']
    )
    return sample
# Strata and their sample sizes, passed in as a dict of (key, value) pairs:
# {stratum: number of rows to draw}.
s_size = {1: 7, 2: 5, 3: 5, 4: 5, 5: 5, 6: 5, 7: 5, 8: 5, 9: 8}
(
    stratified_custom(
        data=frame_fake,
        strata='region',
        sample_size=s_size,
        seed=99,
    ).sort_values(by=['region', 'enum_area'], ascending=True)
)

但是我收到此错误：

ValueError: The truth value of a DataFrame is ambiguous. Use a.empty,a.bool(),a.item(),a.any() or a.all().

我不明白这个错误的含义。如能提供任何帮助，我将不胜感激。

解决方法

经过大量研究，我偶然发现了这篇回答 https://stackoverflow.com/a/58794577/14198137 ，并在我的代码中采用了其中的做法：现在同一个函数不仅可以按各层不同的样本量进行抽样，还可以按固定样本量进行抽样。这是我用于创建数据的代码：

import pandas as pd
import numpy as np
import random as rn 
from faker import Faker
# Seed Faker so the generated records are the same on every run.
# NOTE(review): Faker.seed appears to seed only Faker's own generator, while
# rn.randint below uses the stdlib global generator; call rn.seed(...) as
# well if "area" must be reproducible. TODO confirm.
Faker.seed(99)
fake = Faker()

# Build 100 synthetic records. fix_len=True asks Faker for exactly one digit,
# so "region" is a single non-zero digit.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2),
            "enum_area": fake.random_number(5),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": rn.randint(1, 2),
        }
        for _ in range(100)
    ]
)

# enum_area should be unique: keep the last occurrence of each value and
# sort by enum_area.
frame_fake = frame_fake.drop_duplicates('enum_area', keep='last').sort_values(
    by='enum_area', ascending=True
)
# Renumber the rows sequentially after sorting and discard the old index.
frame_fake = frame_fake.reset_index(drop=True)

这是分层抽样的更新代码，现在可以使用。

def stratified_custom(data, strata, sample_size, seed=None):
    """Draw a stratified random sample with a fixed or per-stratum size.

    Args:
        data: DataFrame to sample from. It is copied, never modified.
        strata: Name of the column that defines the strata.
        sample_size: Either a single integer (the same number of rows is
            drawn from every stratum) or a mapping / Series of
            ``stratum value -> number of rows`` for that stratum.
        seed: Passed to ``DataFrame.sample`` as ``random_state`` for every
            stratum, so results are reproducible for a given seed.

    Returns:
        The sampled rows (original index preserved, strata in sorted order)
        with three extra columns: ``strat_size`` (rows in the stratum),
        ``strat_sample_size`` (rows requested for the stratum) and
        ``inclusion_prob`` (``strat_sample_size / strat_size``).

    Raises:
        ValueError: If a stratum present in ``data`` has no entry in a
            per-stratum ``sample_size``, or if more rows are requested from
            a stratum than it contains.
    """
    from collections.abc import Mapping

    data = data.copy()  # keep the caller's frame untouched
    data['strat_size'] = data.groupby(strata)[strata].transform('count')

    # Decide explicitly between per-stratum and fixed sizes instead of
    # relying on an exception, so genuine errors are not silently rerouted.
    per_stratum = isinstance(sample_size, (Mapping, pd.Series))
    if per_stratum:
        requested = data[strata].map(sample_size)
        unmatched = requested.isna() & data[strata].notna()
        if unmatched.any():
            missing = list(data.loc[unmatched, strata].unique())
            raise ValueError(
                f"sample_size has no entry for strata: {missing}"
            )
        data['strat_sample_size'] = requested
    else:
        data['strat_sample_size'] = sample_size

    # Sampling without replacement cannot return more rows than exist.
    too_big = data['strat_sample_size'] > data['strat_size']
    if too_big.any():
        oversized = list(data.loc[too_big, strata].unique())
        raise ValueError(
            f"sample size exceeds stratum size for strata: {oversized}"
        )

    # Sample each stratum separately, reusing the same seed for every group.
    drawn = []
    for _, group in data.groupby(strata):
        if per_stratum:
            n_rows = sample_size[group[strata].iloc[0]]
        else:
            n_rows = sample_size
        drawn.append(group.sample(n=n_rows, random_state=seed))

    # pd.concat rejects an empty list, so fall back to an empty slice.
    sample = pd.concat(drawn) if drawn else data.iloc[0:0].copy()
    sample['inclusion_prob'] = (
        sample['strat_sample_size'] / sample['strat_size']
    )
    return sample

# Per-stratum sample sizes: {region: number of rows to draw}.
s_size = {1: 3, 2: 9, 3: 5, 4: 5, 5: 5, 6: 5, 7: 5, 8: 5, 9: 8}
variablesize = (
    stratified_custom(
        data=frame_fake, strata='region', sample_size=s_size, seed=99
    ).sort_values(by=['region', 'enum_area'], ascending=True)
).head()
variablesize

# The same function with a single integer draws that many rows per stratum.
fixedsize = (
    stratified_custom(
        data=frame_fake, strata='region', sample_size=3, seed=99
    ).sort_values(by=['region', 'enum_area'], ascending=True)
).head()
fixedsize

可变样本量的输出：

    region  district  enum_area  ...  strat_size  strat_sample_size  inclusion_prob
5        1        60      14737  ...           5                  3             0.6
26       1        42      34017  ...           5                  3             0.6
68       1        31      72092  ...           5                  3             0.6
0        2        65      10566  ...          10                  9             0.9
15       2        22      25560  ...          10                  9             0.9

固定样本量的输出：

    region  district  enum_area  ...  strat_size  strat_sample_size  inclusion_prob
5        1        60      14737  ...           5                  3             0.6
26       1        42      34017  ...           5                  3             0.6
68       1        31      72092  ...           5                  3             0.6
38       2        74      48408  ...          10                  3             0.3
43       2        15      56365  ...          10                  3             0.3

但是我想知道是否有更好的方法来实现这一目标？

pandas pandas sampling