Python sklearn.tree module: DecisionTreeClassifier() example source code
We extracted the following 50 code examples from open-source Python projects to demonstrate how to use sklearn.tree.DecisionTreeClassifier().
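For orientation, here is a minimal, self-contained sketch of the fit/predict workflow that all of the examples below build on; the dataset, split, and max_depth choices are illustrative, not taken from any of the projects.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load a toy dataset and hold out a test split.
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Cap the tree depth to limit overfitting; fit, then score on held-out data.
clf = DecisionTreeClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))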
def main():
iris = datasets.load_iris()
x = iris.data
y = iris.target
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.5)
clrTree = tree.DecisionTreeClassifier()
clrTree = clrTree.fit(x_train, y_train)
outTree = clrTree.predict(x_test)
clrKN = KNeighborsClassifier()
clrKN = clrKN.fit(x_train, y_train)
outKN = clrKN.predict(x_test)
# Prediction accuracy
print("Accuracy for Decision Tree Classifier: " + str(accuracy_score(y_test, outTree)*100)+"%")
print("Accuracy for KNeighbors Classifier: " + str(accuracy_score(y_test, outKN)*100)+"%")
def get_feature_importance(self,clf, model_name ):
clfs = {'RandomForestClassifier':'feature_importances',
'ExtraTreesClassifier': 'feature_importances',
'AdaBoostClassifier': 'feature_importances',
'LogisticRegression': 'coef',
'svm.SVC': 'coef',
'GradientBoostingClassifier': 'feature_importances',
'GaussianNB': None,
'DecisionTreeClassifier': 'feature_importances',
'SGDClassifier': 'coef',
'KNeighborsClassifier': None,
'linear.SVC': 'coef'}
if clfs[model_name] == 'feature_importances':
return list(clf.feature_importances_)
elif clfs[model_name] == 'coef':
return list(clf.coef_.tolist())
else:
return None
def get_classifier_class(class_name):
name_table = {
'svm': SVC,
'k_neighbors': KNeighborsClassifier,
'gaussian_process': GaussianProcessClassifier,
'decision_tree': DecisionTreeClassifier,
'random_forest': RandomForestClassifier,
'ada_boost': AdaBoostClassifier,
'mlp': MLPClassifier,
'gaussian_naive_bayes': GaussianNB,
'quadratic_discriminant_analysis': QuadraticDiscriminantAnalysis
}
if class_name not in name_table:
raise ValueError('No such classifier')
return name_table[class_name]
def __create_classifiers(self):
classifiers = list()
classifiers.append({"func": linear_model.SGDClassifier(loss="log"),
"name": "sgd"})
classifiers.append({"func": neighbors.KNeighborsClassifier(1, weights='distance'),
"name": "knn1"})
classifiers.append({"func": neighbors.KNeighborsClassifier(3,
"name": "knn3"})
classifiers.append({"func": neighbors.KNeighborsClassifier(5,
"name": "knn5"})
classifiers.append({"func": GaussianNB(),
"name": "naive_bayes"})
# classifiers.append({"func": tree.DecisionTreeClassifier(),"name": "decision_tree"})
# classifiers.append({"func": MLPClassifier(max_iter=10000),"name": "mlp"})
# classifiers.append({"func": RandomForestClassifier(),"name": "random_forest"})
return classifiers
def define_model(self, model, parameters, n_cores = 0):
clfs = {'RandomForestClassifier': RandomForestClassifier(n_estimators=50, n_jobs=7),
'ExtraTreesClassifier': ExtraTreesClassifier(n_estimators=10, n_jobs=7, criterion='entropy'),
'AdaBoostClassifier': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
'LogisticRegression': LogisticRegression(penalty='l1', C=1e5),
'svm.SVC': svm.SVC(kernel='linear', probability=True, random_state=0),
'GradientBoostingClassifier': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
'GaussianNB': GaussianNB(),
'DecisionTreeClassifier': DecisionTreeClassifier(),
'SGDClassifier': SGDClassifier(loss="hinge", penalty="l2",
'KNeighborsClassifier': KNeighborsClassifier(n_neighbors=3),
'linear.SVC': svm.LinearSVC() }
if model not in clfs:
raise ConfigError("Unsupported model {}".format(model))
clf = clfs[model]
clf.set_params(**parameters)
return clf
def __init__(self, isTrain, isOutlierRemoval):
super(ClassificationAdaBoost, self).__init__(isTrain, isOutlierRemoval)
# data preprocessing
self.dataPreprocessing()
self.dt_stump = DecisionTreeClassifier(max_depth=10)
self.ada = AdaBoostClassifier(
base_estimator=self.dt_stump,
learning_rate=1,
n_estimators=7,
algorithm="SAMME.R")
# self.dt_stump = DecisionTreeClassifier(max_depth=14)
# self.ada = AdaBoostClassifier(
# base_estimator=self.dt_stump,
# learning_rate=1,
# n_estimators=50,
# algorithm="SAMME")
def learn_decision_tree(data):
DT = tree.DecisionTreeClassifier(max_depth=7)
scorer = make_scorer(matthews_corrcoef)
for i in range(5):
scores = cross_val_score(DT, data.X_train, data.y_train, cv=10, scoring=scorer)
print("iteration",i, "dt mean:", scores.mean())
scores = list(scores)
print("Decision Tree train scores:\n", scores)
return DT
# DT = DT.fit(train_data[:,:-1],train_data[:,-1])
# predictionsDT = DT.predict(validation_data[:,:-1])
# validating predicions
# dtError = 0
# for i in range(0,len(validation_data)):
# if(validation_data[i][20] != predictionsDT[i]):
# dtError = dtError + 1
# print("DT Error : ",float(dtError)/len(validation_data)*100.0)
def analyseReasonWithDecisonTree(anamolySample,normalSample,name):
data = anamolySample
target = []
for i in range(0,len(anamolySample)):
target.append(1)
data.extend(normalSample)
for i in range(0,len(normalSample)):
target.append(0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data,target)
dot_data = tree.export_graphviz(clf, out_file=None,feature_names=name,filled = True,special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
s = str(time.time())
graph.write_pdf(s+"DT.pdf")
def main():
iris = load_iris()
test_idx = [0, 50, 100]
# training Data
train_target = np.delete(iris.target, test_idx)
train_data = np.delete(iris.data, test_idx, axis=0)
# testing data
test_target = iris.target[test_idx]
test_data = iris.data[test_idx]
# Train Classifier
clf = tree.DecisionTreeClassifier()
clf = clf.fit(train_data, train_target)
print(clf.predict(test_data))
# Run main
def main():
# 0: smooth,1: bumpy
features = [[130, 0], [140, 0], [150, 1], [170, 1]]
# 0: apple,1: orange
labels = [0, 0, 1, 1]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(features, labels)
# 160,smooth
predict = [[160, 0]]
if clf.predict(predict)[0] == 0:
    print('you are describing an apple')
elif clf.predict(predict)[0] == 1:
    print('you are describing an orange')
else:
    print("Can't guess")
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False):
if use_tree:
train_clf = Classifier(tree.DecisionTreeClassifier())
else:
train_clf = Classifier()
print(train_clf.clf)
print('')
t_start = time.clock()
train_clf.learn(train_features, train_labels)
t_end = time.clock()
if save_file:
train_clf.save_to_file(open(save_file, 'w'))
p_start = time.clock()
predicted = train_clf.clf.predict(test_features)
p_end = time.clock()
test_labels_t = train_clf.labels.transform(test_labels)
print(classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_))
print('Training time: %fs' % (t_end - t_start))
print('Predicting time: %fs' % (p_end - p_start))
print('Mean squared error: %f' % mean_squared_error(test_labels_t, predicted))
return train_clf.score(test_features, test_labels)
def __init__(
self,data_block, predictors=[],cv_folds=10,
scoring_metric='accuracy',additional_display_metrics=[]):
base_classification.__init__(
self, alg=DecisionTreeClassifier(), data_block=data_block,
predictors=predictors,cv_folds=cv_folds,
scoring_metric=scoring_metric,
additional_display_metrics=additional_display_metrics
)
self.model_output = pd.Series(self.default_parameters)
self.model_output['Feature_Importance'] = "-"
#Set parameters to default values:
self.set_parameters(set_default=True)
def learns(tests,trains,indep=lambda x: x[:-1],
dep = lambda x: x[-1],
rf = Abcd(),
lg = Abcd(),
dt = Abcd(),
nb = Abcd()):
x1, y1, x2, y2 = trainTest(tests, trains, indep, dep)
forest = RandomForestClassifier(n_estimators = 50)
forest = forest.fit(x1,y1)
for n,got in enumerate(forest.predict(x2)):
rf(predicted = got, actual = y2[n])
logreg = linear_model.LogisticRegression(C=1e5)
logreg.fit(x1, y1)
for n,got in enumerate(logreg.predict(x2)):
lg(predicted = got, actual = y2[n])
bayes = GaussianNB()
bayes.fit(x1, y1)
for n, got in enumerate(bayes.predict(x2)):
    nb(predicted=got, actual=y2[n])
dectree = DecisionTreeClassifier(criterion="entropy",
random_state=1)
dectree.fit(x1, y1)
for n, got in enumerate(dectree.predict(x2)):
    dt(predicted=got, actual=y2[n])
def CART(train, test, tunings=None, smoteit=True, duplicate=True):
" CART"
# Apply random forest Classifier to predict the number of bugs.
if smoteit:
train = SMOTE(train, atleast=50, atmost=101, resample=duplicate)
if not tunings:
clf = DecisionTreeClassifier()
else:
clf = DecisionTreeClassifier(max_depth=int(tunings[0]),
min_samples_split=int(tunings[1]),
min_samples_leaf=int(tunings[2]),
max_features=float(tunings[3] / 100),
max_leaf_nodes=int(tunings[4]),
criterion='entropy')
train_DF = formatData(train)
test_DF = formatData(test)
features = train_DF.columns[:-2]
klass = train_DF[train_DF.columns[-2]]
# set_trace()
clf.fit(train_DF[features].astype('float32'), klass.astype('float32'))
preds = clf.predict(test_DF[test_DF.columns[:-2]].astype('float32')).tolist()
return preds
def __init__(self,
threshold=0.6,
subsample=1.,
estimator=DecisionTreeClassifier(max_depth=6),
n_folds=2,
stratify=True,
random_state=1,
n_jobs=-1):
self.threshold = threshold
self.subsample = subsample
self.estimator = estimator
self.n_folds = n_folds
self.stratify = stratify
self.random_state = random_state
self.n_jobs = n_jobs
self.__Ddrifts = dict()
self.__fitOK = False
def __init__(self,
base_estimator=DecisionTreeClassifier(max_depth=10),
softmax=None,
n_estimators=50,
learning_rate=1.0,
random_state=None,
verbose=False):
super(MILBoostClassifier, self).__init__(
base_estimator=base_estimator,
n_estimators=n_estimators,
learning_rate=learning_rate,
random_state=random_state)
if not isinstance(softmax, softmaxFunction):
raise TypeError("softmax input must be an object of class `softmaxFunction`")
self.softmax_fcn = softmax
self._verbose = verbose
self._bag_labels = None
self._inferred_y = None
self._bag_partitioning = None
def parameterChoosing(self):
# Set the parameters by cross-validation
tuned_parameters = [{'max_depth': range(2,60),
'max_features': ['sqrt', 'log2', None]
}
]
clf = GridSearchCV(DecisionTreeClassifier(max_depth=5), tuned_parameters, cv=5, scoring='precision_weighted')
clf.fit(self.X_train, self.y_train.ravel())
print "Best parameters set found on development set:\n"
print clf.best_params_
print "Grid scores on development set:\n"
for params, mean_score, scores in clf.grid_scores_:
print "%0.3f (+/-%0.03f) for %r\n" % (mean_score, scores.std() * 2, params)
print "Detailed classification report:\n"
y_true, y_pred = self.y_test, clf.predict(self.X_test)
print classification_report(y_true, y_pred)
def get_classifier(self):
algo=self.algo
if algo=="GBT":
return GradientBoostingClassifier()
elif algo=="RF":
return RandomForestClassifier()
elif algo=="ADB":
return AdaBoostClassifier()
elif algo =="DT":
return DecisionTreeClassifier()
elif algo=="NB":
return BernoulliNB()
elif algo=="SGD":
return SGDClassifier()
elif algo=="SVC":
return LinearSVC()
elif algo=="MLPC":
return MLPClassifier(activation='logistic', batch_size='auto',
early_stopping=True, hidden_layer_sizes=(100,), learning_rate='adaptive',
learning_rate_init=0.1, max_iter=5000, random_state=1,
solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)
return 0
def __init__(self, X, y, estimator=DecisionTreeClassifier, itern=20, mode="sign"):
self.X = X
self.y = y.copy()
self.estimator = estimator
self.mode = mode
self.itern = itern
self.estimators = [] # estimators produced by boosting algorithm
self.alphas = np.array([]) # weights of each boost estimator
self.m = self.X.shape[0] # number of samples
self.w = np.array([1/self.m] * self.m) # weights of samples
self.cls_list = [] # list used to store classes' name and numbers
self.cls0 = y[0]
for i in range(self.m):
if y[i] not in self.cls_list:
self.cls_list.append(y[i])
if y[i] == self.cls0:
self.y[i] = 1
else:
self.y[i] = -1
if len(self.cls_list) != 2:
    raise TypeError(
        '''This Adaboost only supports two-class problems; for multiclass
        problems, please use AdaboostMH.''')
self.train()
def __init__(self, mode = "sign"):
self.X = X
self.y = y
self.estimator = estimator
self.itern = itern
self.mode = mode
self.m = self.X.shape[0] # number of samples
self.cls_list = [] # list used to store classes' name and numbers
# if type(y[0]) != np.ndarray:
# self.y = y.reshape(len(y),-1)
for i in range(self.m):
for cls in self.y[i]:
if cls not in self.cls_list:
self.cls_list.append(cls)
self.k = len(self.cls_list) # number of classes
self.boost = self.train()
def __init__(self, X, y, estimator=DecisionTreeClassifier, code_dic=None, itern=20):
self.X = X
self.y = y
self.estimator = estimator
self.itern = itern
self.m = self.X.shape[0] # number of samples
self.cls_list = [] # list used to store classes' name and numbers
for i in range(self.m):
if y[i] not in self.cls_list:
self.cls_list.append(y[i])
if code_dic is not None:
    self.k = len(code_dic[self.cls_list[0]]) # dimension of encoding space
else:
    self.k = len(self.cls_list)
if code_dic is None: # generate default encode dictionary
code_dic = {}
for i in range(self.k):
code = np.array([-1] * self.k)
code[i] = 1
code_dic[self.cls_list[i]] = code
self.code_dic = code_dic #store {label: array-like code}
self.boost = self.train()
def test_no_refit_multiple_metrics():
clf = DecisionTreeClassifier()
scoring = {'score_1': 'accuracy', 'score_2': 'accuracy'}
gs = dcv.GridSearchCV(clf, {'max_depth': [1, 2, 3]}, refit=False,
scoring=scoring)
gs.fit(da_X, da_y)
assert not hasattr(gs, "best_estimator_")
assert not hasattr(gs, "best_index_")
assert not hasattr(gs, "best_score_")
assert not hasattr(gs, "best_params_")
for fn_name in ('predict', 'predict_proba', 'predict_log_proba'):
with pytest.raises(NotFittedError) as exc:
getattr(gs, fn_name)(X)
assert (('refit=False. %s is available only after refitting on the '
'best parameters' % fn_name) in str(exc.value))
def build_decision_tree(filename):
"""
??????????????
"""
f=open(sys.argv[1],'r')
reader=csv.reader(f)
x=[]
y=[]
for line in reader:
if line[1] in ['1','2','3']: # keep only samples whose label is 1, 2 or 3
x.append(line[2:4]+line[5:])
y.append(line[1])
x_train,x_test,y_train,y_test=cross_validation.train_test_split(x,y, test_size=0.2, random_state=42)
clf=tree.DecisionTreeClassifier(max_depth=5)
clf=clf.fit(x_train,y_train)
score=clf.score(x_test,y_test)
print(score)
return clf,score
def decision_tree(self, sensors_set):
features = list(self.dataset.get_sensors_set_features(sensors_set))
print("DECISION TREE.....")
print("CLASSIFICATION BASED ON THESE SENSORS: ", self.dataset.get_remained_sensors(sensors_set))
print("NUMBER OF FEATURES: ", len(features))
train_features, train_classes, test_features, test_classes = self.__get_sets_for_classification(
    self.dataset.get_train, self.dataset.get_test, features)
classifier_decision_tree = tree.DecisionTreeClassifier()
classifier_decision_tree.fit(train_features, train_classes)
test_prediction = classifier_decision_tree.predict(test_features)
acc = accuracy_score(test_classes, test_prediction)
df_feature = pd.DataFrame(
{'accuracy': acc, 'features': features, 'importance': classifier_decision_tree.feature_importances_})
df_feature = df_feature.sort_values(by='importance', ascending=False)
print("ACCURACY : " + str(acc))
print("END TREE")
if not os.path.exists(const.DIR_RESULTS):
os.makedirs(const.DIR_RESULTS)
df_feature.to_csv(const.DIR_RESULTS + "/" + str(sensors_set) + const.FILE_DECISION_TREE_RESULTS, index=False)
# random forest algorithm training on training al train set and test on all test set
def setUpClass(self):
"""
Set up the unit test by loading the dataset and training a model.
"""
from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
scikit_data = load_boston()
scikit_model = DecisionTreeClassifier(random_state = 1)
t = scikit_data.target
target = np.digitize(t, np.histogram(t)[1]) - 1
scikit_model.fit(scikit_data.data, target)
# Save the data and the model
self.scikit_data = scikit_data
self.target = target
self.scikit_model = scikit_model
def fitAndPredict(self):
# classifier = LogisticRegression()
# classifier.fit(self.trainingSet,self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'Logistic:'
# print classification_report(self.testLabel,pred_labels)
self.classifier = SVC()
self.classifier.fit(self.trainingSet, self.trainingLabel)
pred_labels = {}
for user in self.testDict:
pred_labels[user] = self.classifier.predict([[self.BDS[user]]])
# print 'SVM:'
# print classification_report(self.testLabel,pred_labels)
# classifier = DecisionTreeClassifier(criterion='entropy')
# classifier.fit(self.trainingSet,self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
# print 'Decision Tree:'
# print classification_report(self.testLabel,pred_labels)
# return self.trainingSet,self.trainingLabel,self.testSet,self.testLabel
return pred_labels
def fitAndPredict(self):
# classifier = LogisticRegression()
# classifier.fit(self.trainingSet,self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
pred_labels = {}
classifier = SVC()
classifier.fit(self.trainingSet, self.trainingLabel)
for user in self.testDict:
pred_labels[user] = classifier.predict([[self.MUD[user], self.RUD[user], self.QUD[user]]])
# print 'SVM:'
# print classification_report(self.testLabel,pred_labels)
return pred_labels
# classifier = DecisionTreeClassifier(criterion='entropy')
# classifier.fit(self.trainingSet,self.trainingLabel)
def fitAndPredict(self):
# classifier = LogisticRegression()
# classifier.fit(self.trainingSet,self.trainingLabel)
# pred_labels = classifier.predict(self.testSet)
classifier = SVC()
classifier.fit(self.trainingSet, self.trainingLabel)
pred_labels = {}
for user in self.testDict:
pred_labels[user] = classifier.predict([[self.entropy[user], self.FMD[user]]])
# print 'SVM:'
# print classification_report(self.testLabel,pred_labels)
return pred_labels
def train_decision_tree(file_name):
file = open(file_name, 'r')
train_data = json.load(file)
file.close()
train_list = list()
train_result = list()
for train_pair in train_data:
tmp = formatting_data(train_pair[0])
train_list.append(tmp)
train_result.append(train_pair[1])
my_clf = tree.DecisionTreeClassifier()
my_clf.fit(train_list, train_result)
return my_clf
# Setting up all the necessary preparation
def article_trainers(articles: ArticleDB):
"""
Run repeated models against article db to predict validity score for
articles.
"""
models = [(DecisionTreeClassifier, {}),
          (RandomForestClassifier, {}),
          (LogisticRegression, {'C': [0.01, 0.1, 10, 100]}),
          (MultinomialNB, {'alpha': [0.1, 1.0, 10.0, 100.0]}),
          (LinearSVC, {'C': [0.01, 0.1, 10, 100]})]
trained_models = []
for classifier, param_grid in models:
res = train_model(articles, classifier, param_grid, probabilities=True)
trained_models.append((str(res), res))
ensemble_learner = VotingClassifier(estimators=trained_models[:4],
voting='soft')
train_model(articles, ensemble_learner, {})
def train_model(data, with_mac=True):
global without_mac_clf, mac_clf
df = pd.DataFrame.from_dict(data)
y = df.pop("location")
features = [f for f in df.columns if f != 'mac']
df = df.rename(columns=dict(zip(features, [POWER_SLAVE_PREFIX + f for f in features])))
model_name = MODEL_MAC_NAME if with_mac else MODEL_NAME
if with_mac:
df = df.apply(LabelEncoder().fit_transform)
else:
df.drop("mac", axis=1, inplace=True)
clf = DecisionTreeClassifier()
clf.fit(df, y)
joblib.dump(clf, model_name)
if with_mac and mac_clf is None:
mac_clf = clf
if not with_mac and without_mac_clf is None:
without_mac_clf = clf
export_graphviz(clf, feature_names=list(df.columns), class_names=y.unique(), filled=True, rounded=True, out_file='model.dot')
os.system("dot -Tpng model.dot -o model.png")
def learn(x, y, test_x):
# set sample weight
weight_list = []
for j in range(len(y)):
if y[j] == "0":
weight_list.append(variables.weight_0_gdbt_b)
if y[j] == "1000":
weight_list.append(variables.weight_1000_gdbt_b)
if y[j] == "1500":
weight_list.append(variables.weight_1500_gdbt_b)
if y[j] == "2000":
weight_list.append(variables.weight_2000_gdbt_b)
clf = tree.DecisionTreeClassifier(min_samples_split=500).fit(x, y, sample_weight=weight_list)
print(clf.feature_importances_)
prediction_list = clf.predict(test_x)
return prediction_list
def use_bagging_classifier():
tree = DecisionTreeClassifier(
criterion='entropy',
max_depth=None,
random_state=3,
)
bag = BaggingClassifier(
base_estimator=tree,
n_estimators=500,
max_samples=1.0,
max_features=1.0,
bootstrap=True,
bootstrap_features=False,
random_state=1
)
return use_ensemble_classifier(tree, 'Decision tree', bag, 'Bagging')
def predict(self):
# classifier = LogisticRegression()
# classifier.fit(self.training,self.trainingLabels)
# pred_labels = classifier.predict(self.test)
# print 'Logistic:'
# print classification_report(self.testLabels,pred_labels)
#
# classifier = SVC()
# classifier.fit(self.training,self.trainingLabels)
# pred_labels = classifier.predict(self.test)
# print 'SVM:'
# print classification_report(self.testLabels,pred_labels)
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(self.training, self.trainingLabels)
pred_labels = classifier.predict(self.test)
print('Decision Tree:')
return pred_labels
def __init__(self, path, etype, **kwargs):
super(EnsembleModel, self).__init__(path, etype=etype, **kwargs)
self.basedir = "models/ensemble/"
self.goldstd = kwargs.get("goldstd")
self.data = {}
self.offsets = []
self.pipeline = Pipeline(
[
#('clf',SGDClassifier(loss='hinge',penalty='l1',alpha=0.0001,n_iter=5,random_state=42)),
#('clf',SGDClassifier())
# ('clf',svm.NuSVC(nu=0.01 ))
('clf', RandomForestClassifier(class_weight={False:1, True:1}, n_jobs=-1, criterion="entropy", warm_start=True))
# ('clf',tree.DecisionTreeClassifier(criterion="entropy")),
# ('clf',MultinomialNB())
# ('clf',GaussianNB())
#('clf',svm.SVC(kernel="rbf",degree=2,C=1)),svm.SVC(kernel="linear",C=2))
#('clf',DummyClassifier(strategy="constant",constant=True))
])
def __init__(self, classes):
"""
Constructor
:param classes: Classes
:param lang: Spacy language
"""
super(DecisionTree, self).__init__(classes)
# Properties
self._token2index = dict()
self._voc_size = 0
self._samples = list()
self._n_samples = 0
self._tree_classifier = DecisionTreeClassifier(random_state=0)
# end __init__
##############################################
# Public
##############################################
##############################################
# Override
##############################################
# To str
def analyseReasonWithDecisonTree(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    print(len(data))
clf = tree.DecisionTreeClassifier()
clf = clf.fit(data, target)
dot_data = tree.export_graphviz(clf,special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
s = str(time.time())
graph.write_pdf(s+"DT.pdf")
def analyseReasonWithDecisonTree(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data = data.append(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    name = []
    for i in data.columns:
        name.append(i)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)
    dot_data = tree.export_graphviz(clf, feature_names=name, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
s = str(time.time())
graph.write_pdf(s+"DT.pdf")
def analyseReasonWithDecisonTree(anamolySample, normalSample):
    data = anamolySample
    target = []
    for i in range(0, len(anamolySample)):
        target.append(1)
    data.extend(normalSample)
    for i in range(0, len(normalSample)):
        target.append(0)
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(data, target)
    dot_data = tree.export_graphviz(clf, special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data)
    s = str(time.time())
    graph.write_pdf(s+"DT.pdf")
def train(self, training_set, training_target, fea_index):
clf = tree.DecisionTreeClassifier(criterion="entropy", min_samples_split=30, class_weight="balanced")
clf = clf.fit(training_set, training_target)
class_names = np.unique([str(i) for i in training_target])
feature_names = [attr_list[i] for i in fea_index]
dot_data = tree.export_graphviz(clf,
feature_names=feature_names,
class_names=class_names,
filled=True,
special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("output/tree-vis.pdf")
joblib.dump(clf, 'output/CART.pkl')
def performDTClass(X_train, y_train, X_test, y_test, fout, savemodel):
"""
Decision Tree Classification
"""
# n = parameters[0]
# l = parameters[1]
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
if savemodel == True:
#fname_out = '{}-{}.pickle'.format(fout,datetime.Now())
fname_out = fout+'.pickle'
with open(fname_out, 'wb') as f:
pickle.dump(clf, f, -1)
accuracy = clf.score(X_test, y_test)
return accuracy
#Todo: use hdf datastructure for dataframes
def clst( X_train, y_train, X_test, y_test, nb_classes):
model = tree.DecisionTreeClassifier()
model.fit( X_train, y_train)
dt_score = model.score( X_test, y_test)
print( "DT-C:", dt_score)
model = svm.SVC( kernel = 'linear')
model.fit( X_train, y_train)
sv_score = model.score( X_test, y_test)
print( "SVC:", sv_score)
model = kkeras.MLPC( [X_train.shape[1], 30, nb_classes])
model.fit( X_train, y_train)
mlp_score = model.score( X_test, y_test)
print( "DNN:", mlp_score)
model = ensemble.RandomForestClassifier( n_estimators=10)
model.fit( X_train, y_train)
rf_score = model.score( X_test, y_test)
print( "RF:", rf_score)
return dt_score, sv_score, mlp_score, rf_score
def decision_tree(X, y, regression, max_depth=3):
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO
from IPython.core.pylabtools import figsize
from IPython.display import Image
figsize(12.5, 6)
import pydot
if regression:
clf = DecisionTreeRegressor(max_depth=max_depth)
else:
clf = DecisionTreeClassifier(max_depth=max_depth)
clf.fit(X, y)
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data, feature_names=list(X.columns),
filled=True,)
graph = pydot.graph_from_dot_data(dot_data.getvalue())
return Image(graph.create_png())
def fit_vanilla(x_train, y_train):
    scores = dict()
    # Decision tree
    dt = DecisionTreeClassifier(random_state=random_state)
    scores['dt'] = clf_scores(dt, x_train, y_train)
    # Logistic Regression
    lr = LogisticRegression(random_state=random_state, n_jobs=-1)
    scores['lr'] = clf_scores(lr, x_train, y_train)
    # Random Forest
    rf = RandomForestClassifier(random_state=random_state, n_jobs=-1)
    scores['rf'] = clf_scores(rf, x_train, y_train)
    return scores
def bench_scikit_tree_classifier(X, Y):
"""Benchmark with scikit-learn decision tree classifier"""
from sklearn.tree import DecisionTreeClassifier
gc.collect()
# start time
tstart = datetime.now()
clf = DecisionTreeClassifier()
clf.fit(X, Y).predict(X)
delta = (datetime.now() - tstart)
# stop time
scikit_classifier_results.append(
delta.seconds + delta.microseconds / mu_second)
def test_probability():
# Predict probabilities using DecisionTreeClassifier.
for name, Tree in CLF_TREES.items():
clf = Tree(max_depth=1, max_features=1, random_state=42)
clf.fit(iris.data, iris.target)
prob_predict = clf.predict_proba(iris.data)
assert_array_almost_equal(np.sum(prob_predict, 1),
np.ones(iris.data.shape[0]),
err_msg="Failed with {0}".format(name))
assert_array_equal(np.argmax(prob_predict, 1),
                   clf.predict(iris.data),
                   err_msg="Failed with {0}".format(name))
assert_almost_equal(clf.predict_proba(iris.data),
np.exp(clf.predict_log_proba(iris.data)), 8,
err_msg="Failed with {0}".format(name))
def test_importances_gini_equal_mse():
# Check that gini is equivalent to mse for binary output variable
X, y = datasets.make_classification(n_samples=2000,
n_features=10,
n_informative=3,
n_redundant=0,
n_repeated=0,
shuffle=False,
random_state=0)
# The gini index and the mean square error (variance) might differ due
# to numerical instability. Since those instabilities mainly occur at
# high tree depth, we restrict this maximal depth.
clf = DecisionTreeClassifier(criterion="gini", max_depth=5,
                             random_state=0).fit(X, y)
reg = DecisionTreeRegressor(criterion="mse", max_depth=5,
                            random_state=0).fit(X, y)
assert_almost_equal(clf.feature_importances_, reg.feature_importances_)
assert_array_equal(clf.tree_.feature, reg.tree_.feature)
assert_array_equal(clf.tree_.children_left, reg.tree_.children_left)
assert_array_equal(clf.tree_.children_right, reg.tree_.children_right)
assert_array_equal(clf.tree_.n_node_samples, reg.tree_.n_node_samples)
def test_sample_weight_invalid():
# Check sample weighting raises errors.
X = np.arange(100)[:, np.newaxis]
y = np.ones(100)
y[:50] = 0.0
clf = DecisionTreeClassifier(random_state=0)
sample_weight = np.random.rand(100, 1)
assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
sample_weight = np.array(0)
assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
sample_weight = np.ones(101)
assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
sample_weight = np.ones(99)
assert_raises(ValueError, clf.fit, X, y, sample_weight=sample_weight)
def test_huge_allocations():
n_bits = int(platform.architecture()[0].rstrip('bit'))
X = np.random.randn(10, 2)
y = np.random.randint(0, 2, 10)
# Sanity check: we cannot request more memory than the size of the address
# space. Currently raises OverflowError.
huge = 2 ** (n_bits + 1)
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
assert_raises(Exception, clf.fit, X, y)
# Non-regression test: MemoryError used to be dropped by Cython
# because of missing "except *".
huge = 2 ** (n_bits - 1) - 1
clf = DecisionTreeClassifier(splitter='best', max_leaf_nodes=huge)
assert_raises(MemoryError, clf.fit, X, y)