分析Python的For循环中包含的数据帧

问题描述

当前情况：

我有一个函数，它把二分类目标变量按取值“1”和“0”拆分，然后读取每个类别下的所有自变量。该函数还会分别对类别“1”和“0”计算每个自变量的核密度估计（KDE），然后计算两条密度曲线的相交面积：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data,bandwidth,margin,target_variable_name):
    """Print the KDE overlap area between class 0 and class 1 for each feature.

    For every column other than the target, a Gaussian KDE is fitted to the
    rows labelled 0 and to the rows labelled 1. The pointwise minimum of the
    two densities is integrated over a 500-point grid and the resulting area
    is printed, one line per feature, in column order.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing NaN are dropped before any computation.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the data range added on both sides of the grid,
            because the KDE extends beyond the observed data.
        target_variable_name: Name of the target column; its values are
            compared against 0 and 1.

    Returns:
        None. The areas are written to stdout.

    Note:
        ``gaussian_kde`` itself may raise when a class has too few or
        constant values; that is not handled here.
    """
    data = data.dropna()
    target = str(target_variable_name)
    labels = data[target]

    # The target is already excluded here, so every remaining column is a
    # feature (the earlier ``names[:-1]`` silently skipped the last one).
    feature_names = list(data.drop(columns=[target]).columns)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in feature_names:
        x0 = data.loc[labels == 0, column_name]
        x1 = data.loc[labels == 1, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)

        # The grid must span the full range of both classes: smallest minimum
        # to largest maximum (using min() for the upper bound cut it short).
        x_min = min(x0.min(), x1.min())
        x_max = max(x0.max(), x1.max())
        dx = margin * (x_max - x_min)

        x = np.linspace(x_min - dx, x_max + dx, 500)
        inters_x = np.minimum(kde0(x), kde1(x))
        area_inters_x = integrate(inters_x, x)  # area under min(kde0, kde1)
        print(area_inters_x)

问题： 如果我有n_class = 4，该函数将如下所示：

def intersection_area(data,bandwidth,margin,target_variable_name):
    """Print the KDE overlap area shared by classes 0, 1, 2 and 3 per feature.

    Hard-coded four-class variant: one Gaussian KDE is fitted per class for
    every feature column, the pointwise minimum of the four densities is
    integrated over a 500-point grid, and the area is printed per feature.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing NaN are dropped before any computation.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the data range added on both sides of the grid.
        target_variable_name: Name of the target column; its values are
            compared against 0, 1, 2 and 3.

    Returns:
        None. The areas are written to stdout.
    """
    data = data.dropna()
    target = str(target_variable_name)
    labels = data[target]
    # Every non-target column is a feature.
    feature_names = list(data.drop(columns=[target]).columns)

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    for column_name in feature_names:
        x0 = data.loc[labels == 0, column_name]
        x1 = data.loc[labels == 1, column_name]
        x2 = data.loc[labels == 2, column_name]
        x3 = data.loc[labels == 3, column_name]

        kde0 = gaussian_kde(x0, bw_method=bandwidth)
        kde1 = gaussian_kde(x1, bw_method=bandwidth)
        kde2 = gaussian_kde(x2, bw_method=bandwidth)
        kde3 = gaussian_kde(x3, bw_method=bandwidth)

        # Grid spans the full data range of all four classes.
        x_min = min(x0.min(), x1.min(), x2.min(), x3.min())
        x_max = max(x0.max(), x1.max(), x2.max(), x3.max())

        dx = margin * (x_max - x_min)
        x_min -= dx
        x_max += dx

        x = np.linspace(x_min, x_max, 500)
        kde0_x = kde0(x)
        kde1_x = kde1(x)
        kde2_x = kde2(x)
        kde3_x = kde3(x)
        # np.minimum is binary (a third positional argument is ``out``), so
        # reduce over the list to take the minimum of all four curves.
        inters_x = np.minimum.reduce([kde0_x, kde1_x, kde2_x, kde3_x])
        area_inters_x = integrate(inters_x, x)
        print(area_inters_x)

现在，如果数据集的类别数 n 事先未知，该怎么办？我正在尝试改进旧代码，使其对多分类数据集更加健壮：针对每个类别计算各自变量的KDE，并计算它们的相交面积。但是我卡在了x = data.loc[data[str(target_name)] == i,str(column_name)]这一部分：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

def intersection_area(data,target_variable_name):
    """Walk every (class, feature) pair and select that class's feature values.

    This is the unfinished n-class draft: it discovers the classes present in
    the target column, then for each class and each feature prints the class
    label and the feature name and selects the matching samples. No KDE or
    area is computed yet.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing NaN are dropped first.
        target_variable_name: Name of the target column.

    Returns:
        None. Class labels and feature names are written to stdout.
    """
    data = data.dropna()
    target = str(target_variable_name)

    # Every column except the target is an independent variable.
    names = list(data.drop(columns=[target]).columns)

    # Look the column up by name; ``data.target_variable_name`` would read an
    # attribute literally called "target_variable_name".
    labels = data[target]
    classes = list(labels.unique())

    for i in classes:
        for column_name in names:
            print(i)  # class being processed (target value: 0, 1, ..., n)
            print(column_name)  # feature being analysed
            # Samples of this feature that belong to class i. This is the
            # point where the draft stops; a KDE step would consume ``x``.
            x = data.loc[labels == i, column_name]

供有兴趣复现该问题的人使用的模拟数据集：

from sklearn.datasets import make_classification

# Synthetic multi-class dataset for reproducing the problem.
# For a binary target set n_classes=2.
# NOTE(review): `weights` presumably has to be shortened to match — confirm
# against the make_classification documentation.
X, y = make_classification(
    n_samples=50000,
    n_features=6,
    n_informative=6,
    n_redundant=0,
    n_repeated=0,
    n_classes=4,
    n_clusters_per_class=3,
    class_sep=0.95,
    flip_y=0.2,
    weights=[0.7, 0.2, 0.1],
    shuffle=True,
    random_state=93,
)

# Six feature columns named var1..var6, one per column of X.
dataset_x = pd.DataFrame(X, columns=['var1', 'var2', 'var3', 'var4', 'var5', 'var6'])

# Single-column frame holding the class labels.
dataset_y = pd.Series(y, name='target').to_frame()

# Features first, target as the last column.
sample_dataset = pd.concat([dataset_x, dataset_y], axis="columns")
print(sample_dataset)

解决方法

可以考虑使用列表推导式，为目标变量的每个类别构建x和kde的列表。与其在每次迭代中打印结果，不如把结果汇总到一个数据框（DataFrame）中：

def intersection_area_new(data,bandwidth,margin,target_variable_name):
    """Return, per feature, the area shared by the KDEs of all target classes.

    For every column other than the target, one Gaussian KDE is fitted per
    distinct target value. The pointwise minimum of those densities is
    integrated over a 500-point grid covering the data range plus a margin.

    Args:
        data: DataFrame holding the feature columns and the target column.
            Rows containing NaN are dropped before any computation.
        bandwidth: Forwarded to ``gaussian_kde`` as ``bw_method``.
        margin: Fraction of the data range added on both sides of the grid,
            because the KDE extends beyond the observed data.
        target_variable_name: Name of the target column. It may sit at any
            position in ``data``.

    Returns:
        DataFrame with one row per feature and the columns ``target`` (the
        target column name), ``column`` (the feature name) and
        ``intersection`` (the integrated overlap area).

    Note:
        ``gaussian_kde`` itself may raise when a class has too few or
        constant values; that is not handled here.
    """
    data = data.dropna()

    # Use the caller-supplied target column rather than a hard-coded 'target'.
    labels = data[target_variable_name]
    classes = labels.unique()

    # Exclude the target by name instead of assuming it is the last column.
    feature_names = [name for name in data.columns if name != target_variable_name]

    # np.trapz was renamed to np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    kde_dicts = []
    for column_name in feature_names:
        # One sample set and one fitted KDE per class.
        x_s = [data.loc[labels == i, column_name] for i in classes]
        kde_s = [gaussian_kde(x, bw_method=bandwidth) for x in x_s]

        # Grid spans the full data range of all classes: smallest minimum to
        # largest maximum (min() of the maxima truncated the upper side).
        x_min = min(x.min() for x in x_s)
        x_max = max(x.max() for x in x_s)
        dx = margin * (x_max - x_min)

        x_array = np.linspace(x_min - dx, x_max + dx, 500)
        inters_x = np.minimum.reduce([kde(x_array) for kde in kde_s])
        area_inters_x = integrate(inters_x, x_array)  # area under the lowest curve

        kde_dicts.append({'target': target_variable_name,'column': column_name,'intersection': area_inters_x})

    # Explicit columns keep the schema stable when there are no features.
    return pd.DataFrame(kde_dicts, columns=['target', 'column', 'intersection'])

输出

# Run on the mock dataset: default KDE bandwidth, 50% margin on each side.
output = intersection_area_new(
    data=sample_dataset,
    bandwidth=None,
    margin=0.5,
    target_variable_name="target",
)
print(output.head(10))

#    target column  intersection
# 0  target   var1      0.842256
# 1  target   var2      0.757190
# 2  target   var3      0.676021
# 3  target   var4      0.873074
# 4  target   var5      0.763626
# 5  target   var6      0.868560

itertools numpy pandas pandas python scipy scipy