我试图遵循建议的实现ID3的大纲形式
# Step 1- Calculate MC (Message Conveyed) for the given dataset (let us call it file TF) in reference to the class attribute
# MC(TF) = -p1*log2(p1) - p2*log2(p2)
# For n classes MC(TF) = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn)
# The probabilities are approximated by relative frequencies.
# Step 2- Calculate Gain for every attribute in the training set .
# Loop 1:
# For each attribute (Aj) Do:
# Consider the attribute is a node from which k branches are emanating,
# where k is the number of unique values in the attribute
# Temporarily, split the file TF into K new files based on the unique values in the attribute Aj.
# Let us call these new files F1, . . ., Fk
# Total =0;
# Loop 2
# for each new file Fi Do:
# Calculate MC for the file and call it MC(Fi).
# Calculate weight for file Fi and call it Weight(Fi)
# Weight(Fi) = |Fi|/|TF|
# Calculate the weighted MC (WMC) for file Fi
# WMC(Fi) = Weight(Fi) * MC(Fi)
# Total = Total + MC(Fi)
# End of loop 2
# Calculate Gain of Aj
# Gain(Aj) = MC(TF) – Total;
# End of Loop 1
# The attribute with the highest gain is the winner.
# Permanently split the file TF into K new files based on the K unique values of the winner attribute.
# Remove the winner attribute from all new K files.
# Now you have the root of the tree (the winner attribute) and this tree has k leaves, and each leaf has its own dataset.
# Step 3- Examine dataset of each leaf.
# If the attribute class has the same value for all the records in the leaf’s dataset, then mark the leaf as “no split”
# else mark it as “split”.
# Step 4- For each leaf’s dataset that is marked “Split” Do.
# The dataset become the new TF
# TF = leaf’s dataset
# Go to Step 1;
我为这个程序编写的代码如下:
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute standard classification metrics and hand them to display_metrics.

    Parameters:
        tp, tn, fp, fn: true/false positive/negative counts from the confusion matrix.
        p: total number of actually-positive records (tp + fn).
        n: total number of actually-negative records (tn + fp).

    Raises ZeroDivisionError if p, n, p+n, or tp+fp is zero.
    """
    # BUG FIX: the original wrote `tp + tn /(p+n)` and `fp + fn /(p+n)`;
    # Python's operator precedence divides only the second term, so both
    # sums must be parenthesized.
    accuracy = (tp + tn) / (p + n)
    error_rate = (fp + fn) / (p + n)
    sensitivity = tp / p
    precision = tp / (tp + fp)
    specificity = tn / n
    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)
def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print all five evaluation metrics on a single line."""
    values = (accuracy, error_rate, sensitivity, precision, specificity)
    template = 'Accuracy: {}, Error_rate:{}, Sensitivity:{}, Precision:{}, specificity:{}'
    print(template.format(*values))
def mc(columnName, training_set):
    """Message conveyed (base-2 Shannon entropy) of one column.

    The class probabilities are approximated by the relative frequencies
    of the column's values: MC = -sum(p_i * log2(p_i)).
    """
    frequencies = training_set[columnName].value_counts(normalize=True)
    return -(np.log2(frequencies) * frequencies).sum()
def isUnique(s):
    """Return True when every entry of Series *s* equals its first entry.

    Despite the name, this tests "all values identical", not "all values
    distinct" (raises IndexError on an empty Series).
    """
    values = s.to_numpy()
    return (values == values[0]).all()
def ID3(root,training_set,test_set):
# Recursively grow an ID3 decision tree: on the first call (root == "") pick the
# column with the highest rounded MC (entropy) as the root, then compute a gain
# per attribute, split on the winner, and recurse on each leaf dataset.
# NOTE(review): leading indentation was lost when this code was pasted, so the
# block structure below is flattened and the code is not runnable as shown.
if(root == ""):
# Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
print(f'Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
# MC = -p1*log2(p1) - p2*log2(p2)
# For n classes MC = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn)
# For each column calculate the gain.
numberOfColumns = 0
mcDictionary = {}
print('***********************************')
print('For each column calculate the gain.')
# NOTE(review): DataFrame.iteritems() was removed in pandas 2.x; items() is the modern spelling.
for (columnName, columnData) in training_set.iteritems():
messageConveyed = mc(columnName,training_set)
# NOTE(review): round() collapses each entropy to an integer, so many columns tie — confirm this is intended.
mcDictionary.update({columnName:round(messageConveyed)})
numberOfColumns+=1
print('***********************************')
print(f'numberOfColumns {numberOfColumns}')
print(f'mcDictionary {mcDictionary}')
# The column with the highest gain is the root.
print(f'The column with the highest gain is the root.')
values = mcDictionary.values()
max_value = max(values)
print(f'The max value is {max_value}')
# print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
# NOTE(review): val_list is never used.
val_list = list(values)
# On ties, index() returns the first column holding max_value.
columnWithMaximumInformationGain = list(mcDictionary.keys())[list(mcDictionary.values()).index(max_value)]
print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
# select the max value from the gain array
# this is the new root
root = columnWithMaximumInformationGain
print(f'root is {root}')
print("******************************************")
print("************** ROOT ******************")
print(f"TF is {root}**********************")
print("******************************************")
print(f'isUnique = {isUnique(training_set[root])}')
# Base case: stop when every value of the root column is identical.
if(isUnique(training_set[root])):
return
# Step 2 - Repeat for every attribute
print(f'Step 2 - Repeat for every attribute')
# Loop 1
attribute = ""
maximum = 0
for (F, columnData) in training_set.iteritems():
print(f'processing attribute {F}')
# Loop 2
# NOTE(review): Total is initialized but never accumulated — the weighted sum
# from the outline (Total += WMC(Fi)) is missing; see `total` below.
Total = 0
uniques = training_set[F].unique()
for k in uniques:
print(f'processing branch {k} for {F}')
# Calculate MC for column
# NOTE(review): mc(F, ...) ignores the branch value k, so every iteration of
# this inner loop computes the same number (no per-branch split is taken).
messageConveyed = mc(F,training_set)
# Calculate the weight for F
# NOTE(review): count() counts non-null entries of the whole column, not the
# size of the branch subset |Fi| — weight is 1.0 unless there are NaNs.
F_D = training_set[F].count()
TF_D = training_set[root].count()
weight = F_D/TF_D
# NOTE(review): lowercase `total` is overwritten each pass; Total stays 0.
total = weight*messageConveyed
gain = mcDictionary[root] - total
if(gain > maximum):
attribute = F
maximum = gain
print(f"gain: {gain} for {F}")
# NOTE(review): this prints the LAST computed gain, not the maximum.
print(f'attribute {attribute} has the max gain of {gain}')
print(f'removing {attribute}')
root = attribute
print(f'new root {root} has branches {training_set[root].unique()}')
print(f'root is {root}')
print("******************************************")
print("************** ROOT ******************")
print(f"TF is {root}**********************")
print("******************************************")
unique_values = training_set[root].unique()
datasets = []
for unique_value in unique_values:
print(f'processing for file : {unique_value} ')
# NOTE(review): strict > and < drop the rows EQUAL to unique_value, and two
# files are produced per unique value instead of one per branch.
df_1 = training_set[training_set[attribute] > unique_value]
df_2 = training_set[training_set[attribute] < unique_value]
datasets.append(df_1)
datasets.append(df_2)
# Remove the winner attribute from the (parent) training set.
del training_set[attribute]
# Step 3 - Examine dataset of each leaf
print(f'Step 3 - Examine dataset of each leaf')
print(f'number of datasets {len(datasets)}')
print("*****************")
print("printing datasets")
print("*****************")
splits = {}
all_values_same = False
for df in datasets:
print(f'Step 4 - for {attribute} dataset check is marked "split"')
# NOTE(review): `attribute` was deleted from training_set but each df was
# sliced earlier and still has the column; Series.is_unique is True when all
# values are DISTINCT — the opposite of "all values the same" (the isUnique()
# helper above is the all-same check).
if(df[attribute].is_unique):
print(f'all values are the same no split')
all_values_same = True
else:
print(f'values are not unique perform split')
all_values_same = False
# NOTE(review): the constant key "split" means only the LAST df survives here.
splits.update({"split":df})
if(not all_values_same):
for split in splits:
# NOTE(review): iterating a dict yields its KEYS (strings), and str has no
# .get() — this line raises AttributeError; splits.values() / splits["split"]
# was presumably intended.
ID3(root,split.get("split"),test_set)
else:
# NOTE(review): recursing with an unchanged training_set risks infinite recursion.
ID3(root,training_set,test_set)
print("*****************")
# use the training set to predict the test set.
# use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
# Load both CSV files from the current working directory (raises
# FileNotFoundError if either file is absent).
test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")
print('***********************************')
print('TRAINING SET')
print(training_set)
print('***********************************')
print('***********************************')
print('TEST SET')
print(test_set)
print('***********************************')
# NOTE(review): the two prints below duplicate the framed output above.
print(f'test_set: {test_set}')
print(f'training_set: {training_set}')
def BayesClassifier(training_set, test_set):
    """Train Gaussian Naive Bayes on the training set and classify the test set.

    Assumes the class attribute is the LAST column of both data frames
    (TODO confirm against the CSV layout — "Volume" in the sample files).

    Returns the array of predicted class labels for the test records.
    """
    # BUG FIX: the original called clf.fit(test_set.values, training_set.values),
    # i.e. the test set as the feature matrix and the entire training frame as
    # the label vector — the roles are inverted, the sample counts differ, and
    # fit() requires a 1-D y.  Train on the training features/labels instead.
    X_train = training_set.iloc[:, :-1].values
    y_train = training_set.iloc[:, -1].values
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    # Predict using only the feature columns of the test set.
    return clf.predict(test_set.iloc[:, :-1].values)
# prompt user to select either ID3 or Bayes classifier.
# Interactive prompts are commented out; the selection is hard-coded to "ID3".
selection = "ID3" #= input("Please enter your selection for either ID3 or Bayes classification: ")
# NOTE(review): threshold and g are set here but never used in this version.
threshold = 0.9 #= input("Please enter a threshold: ")
g = 0.05 #= input("Please enter a value for g: ")
root = ""
if(selection == "ID3"):
ID3(root,training_set,test_set)
if(selection == "Bayes"):
BayesClassifier(training_set,test_set)
该程序的目标是在决策树中对训练数据进行分类,如下所示
Veriety
/ \
Volume   Location   etc。
该程序的数据集如下:赋值2-- ID3.csv的培训集
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1
1,4,4,4,4,1,2,1,6
1,5,4,4,4,1,2,1,4
1,4,5,5,5,1,6,2,4
2,5,4,4,4,2,3,1,1
1,5,5,5,5,1,6,2,5
2,6,5,5,5,2,2,1,4
测试集 (Assignment 2--Test set for ID3.csv) 如下:
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
1,4,4,4,4,1,6,4,6
对此,我衷心感谢您的帮助。我对这个程序的最终结果没有一个明确的理解。如果我能理解这个过程,我就能完成这个程序了。
发布于 2021-11-26 21:32:07
好的,让我们从基础开始:
希望这有助于澄清总的想法。
发布于 2021-11-26 19:24:58
这部分我想通了。
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
import math
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
# Step 1- Calculate MC (Message Conveyed) for the given dataset (let us call it file TF) in reference to the class attribute
# MC(TF) = -p1*log2(p1) - p2*log2(p2)
def mc(classAttribute, attribute, training_set):
    """Message conveyed (base-2 entropy) for *classAttribute*.

    When *attribute* is truthy, the frequencies are taken over the whole
    rows of the sub-frame where classAttribute == attribute (a DataFrame
    value_counts); otherwise over the classAttribute column itself.

    NOTE(review): `if attribute:` treats the attribute value 0 like None,
    silently falling back to the whole-column entropy — confirm intended.
    """
    if attribute:
        target = training_set[training_set[classAttribute] == attribute]
    else:
        target = training_set[classAttribute]
    frequencies = target.value_counts(normalize=True)
    return -(np.log2(frequencies) * frequencies).sum()
def wmc(classAttribute, attribute, training_set):
    """Weight of *attribute* within *classAttribute*: |Fi| / |TF|.

    Prints the fraction as "count/total" before returning it.
    """
    mask = training_set[classAttribute] == attribute
    attributeCount = int(mask.sum())
    total = len(training_set)
    print(f'{attributeCount}/{total}')
    return attributeCount / total
def ID3(root,training_set,test_set):
# Second (reworked) ID3: pick the attribute with the highest information gain,
# split on its unique values, and recurse on each leaf marked "split".
# NOTE(review): leading indentation was lost when this code was pasted, so the
# block structure below is flattened and the code is not runnable as shown.
highestGainAttribute = ""
highestGainValue = -math.inf
# NOTE(review): DataFrame.iteritems() was removed in pandas 2.x; items() is the modern spelling.
for classAttribute, values in training_set.iteritems():
# MC of the whole column (the outline's MC(TF)).
messageConveyed = mc(classAttribute, attribute=None, training_set=training_set)
print(f"{classAttribute} mc: {messageConveyed}")
attributes = training_set[classAttribute].unique()
print(f"{classAttribute}\n")
weightedMessageConveyed = 0
for attribute in attributes:
weight = wmc(classAttribute, attribute, training_set)
# NOTE(review): this overwrites the whole-column MC computed above, so the
# gain below uses the LAST branch's MC rather than MC(TF).
messageConveyed = mc(classAttribute, attribute, training_set)
print(f"wmc({attribute}) = {weight}")
weightedMessageConveyed += weight*messageConveyed
print(f'wmc({classAttribute}) = {weightedMessageConveyed}')
gain = messageConveyed - weightedMessageConveyed
print(f'MC - wmc({classAttribute}) = {messageConveyed} - {weightedMessageConveyed} = {gain}')
if gain > highestGainValue:
highestGainAttribute = classAttribute
highestGainValue = gain
print(f'winner is {highestGainAttribute} with gain of {highestGainValue}')
root = highestGainAttribute
leaves = training_set[root].unique()
splits = {}
for leaf in leaves:
print(f'leaf: {leaf} of root: {root}')
# NOTE(review): after filtering root == leaf, the root column is constant, so
# is_unique (all values DISTINCT) is True only for single-row leaves; this is
# not the outline's "class attribute has the same value for all records" test.
if training_set[training_set[root] == leaf][root].is_unique:
print(f'all of the records for leaf: {leaf} are the same. NO SPLIT')
splits.update({leaf:"no split"})
# NOTE(review): returning here abandons the remaining leaves of this node.
return
else:
print(f'all of the records for leaf: {leaf} are NOT the same. SPLIT')
splits.update({leaf:"split"})
for leaf,split in splits.items():
if split == "split":
print(f"setting {leaf} as the new dataset")
# `root in training_set` tests column membership; root may already have been
# dropped by an earlier iteration of this loop.
if root in training_set:
training_set = training_set[training_set[root] == leaf].drop(columns=root)
# Recurse with the leaf's dataset as the new TF (outline Step 4).
ID3(root,training_set,test_set)
# use the training set to predict the test set.
# use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
# Load both CSV files from the current working directory (raises
# FileNotFoundError if either file is absent).
test_set_ID3 = pd.read_csv("Assignment 2--Test set for ID3.csv")
training_set_ID3 = pd.read_csv("Assignment 2--Training set for ID3.csv")
# prompt user to select either ID3 or Bayes classifier.
# Interactive prompts are commented out; the selection is hard-coded to "ID3".
selection = "ID3" #= input("Please enter your selection for either ID3 or Bayes classification: ")
# NOTE(review): threshold and g are set here but never used in this version.
threshold = 0.9 #= input("Please enter a threshold: ")
g = 0.05 #= input("Please enter a value for g: ")
root = ""
if(selection == "ID3"):
print('***********************************')
print('TRAINING SET')
print(training_set_ID3)
print('***********************************')
print('***********************************')
print('TEST SET')
print(test_set_ID3)
print('***********************************')
# Kick off the recursive tree construction with an empty root.
ID3(root,training_set_ID3,test_set_ID3)
```
来源: https://datascience.stackexchange.com/questions/104516
复制相似问题