尝试在数据集上实现朴素贝叶斯算法

问题描述

我有一个数据集,想在它上面实现朴素贝叶斯算法,但代码在第 107 行 `str_column_to_float(dataset,i)` 处触发了如下错误:“无法将字符串转换为浮点数:''”(could not convert string to float: '')。我原以为这是各列的标题行造成的,但即使删除标题行后再运行代码,仍然出现同样的错误。任何帮助将不胜感激。数据集链接如下:[数据集][1]。代码如下:

# Make Predictions with Naive Bayes On The Accident Project Dataset
from csv import reader
from math import sqrt
from math import exp
from math import pi

# Load a CSV file
def load_csv(filename):
    """Load a CSV file into a list of rows, skipping empty rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows; each row is a list of the raw string cells.
    """
    # newline='' lets the csv module do its own line-ending handling, as the
    # csv documentation requires for files passed to csv.reader.
    with open(filename, 'r', newline='') as file:
        # csv.reader yields an empty list for a blank line; drop those.
        return [row for row in reader(file) if row]

# Convert string column to float
def str_column_to_float(dataset, column):
    """Convert one column of every row from string to float, in place.

    Args:
        dataset: List of rows (lists) whose cells in ``column`` are strings.
        column: Index of the column to convert.

    Raises:
        ValueError: If a cell is empty or otherwise not numeric. The message
            names the offending row and column so the bad cell can be found
            in the source file.
    """
    for index, row in enumerate(dataset):
        cell = row[column].strip()
        try:
            row[column] = float(cell)
        except ValueError as err:
            # float('') and float('abc') both end up here; re-raise with the
            # position of the cell, which the bare float() message lacks.
            raise ValueError(
                'could not convert string to float: %r (row %d, column %s)'
                % (cell, index, column)
            ) from err

# Convert string column to integer
def str_column_to_int(dataset, column):
    """Replace each distinct value of a column with an integer code, in place.

    Codes are assigned in order of first appearance in ``dataset`` (0, 1,
    2, ...), so the mapping is the same on every run. Each assignment is
    printed as ``[value] => code``.

    Args:
        dataset: List of rows (lists).
        column: Index of the column to encode.

    Returns:
        A dict mapping each original value to its integer code.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order; iterating
    # a set of strings instead would give an order that varies between runs.
    distinct = dict.fromkeys(row[column] for row in dataset)
    lookup = {value: code for code, value in enumerate(distinct)}
    for value, code in lookup.items():
        print('[%s] => %d' % (value, code))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

# Split the dataset by class values,returns a dictionary
def separate_by_class(dataset):
    """Group the rows of a dataset by class value.

    The class value of a row is its last element.

    Args:
        dataset: List of rows (sequences); each row's last element is its
            class value.

    Returns:
        A dict mapping each class value to the list of rows that carry it,
        in their original order.
    """
    separated = dict()
    for vector in dataset:
        # setdefault creates the per-class list the first time a class is seen.
        separated.setdefault(vector[-1], []).append(vector)
    return separated

# Calculate the mean of a list of numbers
def mean(numbers):
    """Return the arithmetic mean of a sequence of numbers.

    Args:
        numbers: Non-empty sized iterable of numbers.

    Returns:
        The mean as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    return sum(numbers) / float(len(numbers))

# Calculate the standard deviation of a list of numbers
def stdev(numbers):
    """Return the sample standard deviation of a sequence of numbers.

    The variance is divided by ``len(numbers) - 1``.

    Args:
        numbers: Sized iterable of at least two numbers.

    Returns:
        The standard deviation as a float.

    Raises:
        ZeroDivisionError: If ``numbers`` has fewer than two elements.
    """
    avg = mean(numbers)
    squared_deviations = sum((x - avg) ** 2 for x in numbers)
    variance = squared_deviations / float(len(numbers) - 1)
    return sqrt(variance)

# Calculate the mean,stdev and count for each column in a dataset
def summarize_dataset(dataset):
    """Compute (mean, stdev, count) for every feature column of a dataset.

    The last column of each row is treated as the class label and gets no
    summary.

    Args:
        dataset: List of equally long rows; all columns but the last must be
            numeric.

    Returns:
        A list with one ``(mean, stdev, count)`` tuple per feature column.
        An empty dataset yields an empty list.
    """
    columns = list(zip(*dataset))
    # Slice off the label column up front so no statistics are computed for
    # it, and so an empty dataset does not fail on deleting a missing entry.
    return [(mean(col), stdev(col), len(col)) for col in columns[:-1]]

# Split dataset by class then calculate statistics for each row
def summarize_by_class(dataset):
    """Split a dataset by class and summarize the columns of each class.

    Args:
        dataset: List of rows whose last element is the class value.

    Returns:
        A dict mapping each class value to the result of
        ``summarize_dataset`` for the rows of that class.
    """
    return {
        class_value: summarize_dataset(rows)
        for class_value, rows in separate_by_class(dataset).items()
    }

# Calculate the Gaussian probability distribution function for x
def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at ``x``.

    Args:
        x: Point at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution; must be non-zero,
            since the formula divides by it.

    Returns:
        The density value as a float.

    Raises:
        ZeroDivisionError: If ``stdev`` is zero.
    """
    exponent = exp(-((x - mean) ** 2 / (2 * stdev ** 2)))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

# Calculate the probabilities of predicting each class for a given row
def calculate_class_probabilities(summaries, row):
    """Score a row against every class.

    For each class the score is the class prior (its share of all training
    rows) multiplied by the Gaussian density of every feature value. The
    scores are not normalized to sum to 1.

    Args:
        summaries: Dict mapping class value to a list of
            ``(mean, stdev, count)`` tuples, one per feature column.
        row: Sequence holding one value per feature column, in the same
            column order as ``summaries``.

    Returns:
        A dict mapping each class value to its score.
    """
    # The count stored with the first feature is the number of rows of that
    # class, so summing it over all classes gives the training-set size.
    total_rows = sum(summaries[label][0][2] for label in summaries)
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = class_summaries[0][2] / float(total_rows)
        # mu/sigma rather than mean/stdev: avoids shadowing the module-level
        # helper functions of those names.
        for i, (mu, sigma, _) in enumerate(class_summaries):
            probabilities[class_value] *= calculate_probability(row[i], mu, sigma)
    return probabilities

# Predict the class for a given row
def predict(summaries, row):
    """Return the class with the highest score for a row.

    Args:
        summaries: Per-class column summaries, as accepted by
            ``calculate_class_probabilities``.
        row: Sequence holding one value per feature column.

    Returns:
        The class value with the largest score; on a tie, the class that
        comes first in ``summaries``. ``None`` if ``summaries`` is empty.
    """
    probabilities = calculate_class_probabilities(summaries, row)
    # max() keeps the first of several equal maxima; default covers the
    # no-classes case.
    return max(probabilities, key=probabilities.get, default=None)

# Make a prediction with Naive Bayes on Accident Dataset
# The path is split into adjacent literals only to keep the line short;
# Python joins them into a single string.
filename = (
    'C:/Users/Vince/Desktop/University of Wyoming PHD/Year 2/'
    'Machine Learning/Term Project/Accident Project dataset.csv'
)
dataset = load_csv(filename)
# Every column except the last (the class label) is converted to float.
# NOTE(review): float() raises ValueError on a header row or an empty cell,
# so such cells must be removed or filled in before this loop.
for i in range(len(dataset[0]) - 1):
    str_column_to_float(dataset, i)
# convert class column to integers
str_column_to_int(dataset, len(dataset[0]) - 1)
# fit model
model = summarize_by_class(dataset)
# define a new record
# NOTE(review): needs one value per feature column; confirm the dataset
# really has exactly three feature columns.
row = [1, 1, 1]
# predict the label
label = predict(model, row)
print('Data=%s,Predicted: %s' % (row, label))


  [1]: https://docs.google.com/spreadsheets/d/1aFJLSYqo59QUYJ6es09ZHY0UBqwH6cbgV4JjxY1HXZo/edit?usp=sharing

解决方法

引发 ValueError 是因为 float() 试图把一个无法解析为数字的字符串(例如一个单词,或者像报错信息中那样的空字符串 '')转换为浮点数。

# Raises ValueError: the text "one" cannot be parsed as a number.
float("one")

# Does not raise: "1" is a numeric string, so this evaluates to 1.0.
float("1")

您需要找到非数字的字符串,然后手动转换它。您可以更改代码以帮助您找到它,如下所示:

def str_column_to_float(dataset, column):
    """Convert one column to float in place, reporting every bad cell.

    Cells that cannot be parsed are left unchanged and a message naming the
    value, row index and column is printed for each of them, so all
    problem cells can be located in a single run.

    Args:
        dataset: List of rows (lists) whose cells in ``column`` are strings.
        column: Index of the column to convert.
    """
    for i, row in enumerate(dataset):
        # The try sits inside the loop so one bad cell does not stop the
        # conversion of the remaining rows.
        try:
            row[column] = float(row[column].strip())
        except ValueError:
            print(f'Change value: {row[column]} on row {i} column {column} to numeric.')