用Python计算信息增益

问题描述

所以我得到了一个当前正在尝试拆分的数据集,然后计算其熵,然后得到信息增益。我认为我已经正确实现了它,但是运行它时却没有得到正确的输出,因为这表明我的索引编制不正确?有谁知道这是怎么回事,或者有其他方法可以对此进行编码?

get_ipython().run_line_magic('matplotlib','inline')
import numpy as np

from sklearn import datasets as ds
from sklearn.decomposition import PCA
from sklearn import preprocessing

import matplotlib.pyplot as plt

data_all = ds.load_breast_cancer()

x = data_all.data
y = data_all.target

y_names = data_all.target_names 

feature_names = data_all.feature_names

split = int(x.shape[0] * 0.6)

x_train = x[:split,:]
y_train = y[:split]

x_test = x[split:,:]
y_test = y[split:]

print('Training set size:',x_train.shape[0])
print('Test set size:',x_test.shape[0])

def calculate_entropy(y):
    
    entropy = 0.0
    n = len(y)
    
    counts = np.bincount(y)
    counts = counts.astype(float)
    
    div = counts / n
    div = div.astype(float)
    
    for element in div:
        if element == 0.0:
            entropy = 0.0
        else:
            entropy -= element * np.log2(element)
    
    return entropy

print("The entropy of 'y' is: {:.4f}".format(calculate_entropy(y)))
    
def find_split(x,y):
    """Given a dataset and its target values,this finds the optimal combination
    of feature and split point that gives the maximum information gain."""
    
    # Need the starting entropy so we can measure improvement...
    start_entropy = calculate_entropy(y)
    
    # Best thus far,initialised to a dud that will be replaced immediately...
    best = {'infogain' : -np.inf}
    # Loop every possible split of every dimension...
    for i in range(x.shape[1]):
        for split in np.unique(x[:,i]):
            
            left_indices = x[i] <= split
            right_indices = x[i] > split
            
            total_left = len(x[i][x[i] <= split])
            total_right = len(x[i][x[i] > split])
            total_of_everything = total_left + total_right
            
            fraction_of_right = total_right/total_of_everything
            fraction_of_left = total_left/total_of_everything
            
            entropy_of_left_split = calculate_entropy(left_indices)
            entropy_of_right_split = calculate_entropy(right_indices)
            
            infogain = start_entropy - (entropy_of_left_split*fraction_of_left) - (entropy_of_right_split*fraction_of_right)
            
            
            if infogain > best['infogain']:
                best = {'feature' : i,'split' : split,'infogain' : infogain,'left_indices' : left_indices,'right_indices' : right_indices}
    return best

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)