首页
学习
活动
专区
圈层
工具
发布
社区首页 >问答首页 >如何实现ID3

如何实现ID3
EN

Data Science用户
提问于 2021-11-26 13:07:05
回答 2查看 129关注 0票数 0

我试图遵循建议的实现ID3的大纲形式

代码语言:python
复制
# Step 1- Calculate MC (Message Conveyed) for the given dataset (let us call it file TF) in reference to  the class attribute 
# MC(TF) = -p1*log2(p1) - p2*log2(p2) 
# For n classes MC(TF) = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn) 
# The probabilities are approximated by relative frequencies. 
# Step 2- Calculate Gain for every attribute in the training set . 
# Loop 1:  
 # For each attribute (Aj) Do: 
# Consider the attribute is a node from which k branches are emanating,  
# where k is the number of unique values in the attribute  
# Temporarily, split the file TF into K new files based on the unique values in the  attribute Aj. 
# Let us call these new files F1, . . ., Fk  
# Total =0; 
# Loop 2 
 # for each new file Fi Do: 
# Calculate MC for the file and call it MC(Fi). 
# Calculate weight for file Fi and call it Weight(Fi) 
# Weight(Fi) = |Fi|/|TF| 
# Calculate the weighted MC (WMC) for file Fi 
# WMC(Fi) = Weight(Fi) * MC(Fi) 
# Total = Total + MC(Fi)  
# End of loop 2 
# Calculate Gain of Aj 
# Gain(Aj) = MC(TF) - Total; 
# End of Loop 1 
# The attribute with the highest gain is the winner. 
# Permanently split the file TF into K new files based on the K unique values of the winner  attribute. 
# Remove the winner attribute from all new K files. 
# Now you have the root of the tree (the winner attribute) and this tree has k leaves, and  each leaf has its own dataset.  
# Step 3- Examine dataset of each leaf.  
# If the attribute class has the same value for all the records in the leaf’s dataset,  then mark the leaf as “no split”  
# else mark it as “split”.  
# Step 4- For each leaf’s dataset that is marked “Split” Do. 
# The dataset become the new TF  
# TF = leaf’s dataset 
# Go to Step 1;  

我为这个程序编写的代码如下:

代码语言:python
复制
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

def calculate_metrics(tp, tn, fn, p, n, fp):
    """Compute confusion-matrix metrics for a classifier run and print them.

    Args:
        tp: number of true positives
        tn: number of true negatives
        fn: number of false negatives
        p:  total number of actual positive records
        n:  total number of actual negative records
        fp: number of false positives
    """
    # BUG FIX: the original `tp + tn /(p+n)` and `fp + fn /(p + n)` only
    # divided the second operand (operator precedence); both sums must be
    # divided by the total record count.
    accuracy = (tp + tn) / (p + n)
    error_rate = (fp + fn) / (p + n)
    sensitivity = tp / p          # recall / true-positive rate
    precision = tp / (tp + fp)
    specificity = tn / n          # true-negative rate

    display_metrics(accuracy, error_rate, sensitivity, precision, specificity)

def display_metrics(accuracy, error_rate, sensitivity, precision, specificity):
    """Print the five classifier evaluation metrics on a single line."""
    print(f'Accuracy: {accuracy}, Error_rate:{error_rate}, Sensitivity:{sensitivity}, Precision:{precision}, specificity:{specificity}')

def mc(columnName,training_set):
    """Return the entropy ("message conveyed") of one column of the set.

    Probabilities are approximated by relative value frequencies; the
    result is -sum(p_i * log2(p_i)) over the column's distinct values.
    """
    freqs = training_set[columnName].value_counts(normalize=True)
    return -np.sum(freqs * np.log2(freqs))

def isUnique(s):
    """Return True when every element of Series ``s`` equals its first element."""
    values = s.to_numpy()  # s.values (pandas<0.24)
    first = values[0]
    return (values == first).all()

def ID3(root,training_set,test_set):
    """Grow an ID3-style decision tree from ``training_set``, printing progress.

    ``root`` is "" on the first call; recursive calls pass the previously
    selected attribute. ``test_set`` is accepted but never used in this body.

    NOTE(review): this implementation has several defects, flagged inline:
    it stores rounded per-column entropy as "gain", confuses ``total`` with
    ``Total``, uses ``Series.is_unique`` with inverted meaning, and iterates
    a dict's keys as if they were dicts. It also recurses unconditionally,
    so it can loop forever. Flagged rather than fixed to keep this block
    byte-identical apart from comments.
    """

    if(root == ""):
        # Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute
        print(f'Step 1- Calculate MC (Message Conveyed) for the given data set in reference to the class attribute')
        # MC = -p1*log2(p1) - p2*log2(p2)
        # For n classes MC = -p1log2(p1) - p2*log2(p2)-...-pn*log2(pn)

        # For each column calculate the gain.
        # NOTE(review): this stores each column's own entropy, not a "gain";
        # the pseudocode's gain is MC(TF) minus the weighted subset entropies.
        # round() also discards the fractional part, so ties are frequent.
        numberOfColumns = 0
        mcDictionary = {}
        print('***********************************')
        print('For each column calculate the gain.')
        # NOTE(review): DataFrame.iteritems() was removed in pandas 2.0;
        # modern code would use .items().
        for (columnName, columnData) in training_set.iteritems():
            messageConveyed = mc(columnName,training_set)
            mcDictionary.update({columnName:round(messageConveyed)})
            numberOfColumns+=1
        print('***********************************')
        print(f'numberOfColumns {numberOfColumns}')
        print(f'mcDictionary {mcDictionary}')


        # The column with the highest gain is the root.
        print(f'The column with the highest gain is the root.')
        values = mcDictionary.values()
        max_value = max(values)
        print(f'The max value is {max_value}')
        # print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')
        val_list = list(values)
        # Reverse lookup: first key whose (rounded) entropy equals the maximum.
        columnWithMaximumInformationGain = list(mcDictionary.keys())[list(mcDictionary.values()).index(max_value)]
        print(f'The max value, {max_value}, is associated with column {columnWithMaximumInformationGain}')

        # select the max value from the gain array
        # this is the new root
        root =  columnWithMaximumInformationGain
        print(f'root is {root}')
        print("******************************************")
        print("**************   ROOT   ******************")
        print(f"TF is {root}**********************")
        print("******************************************")
        print(f'isUnique = {isUnique(training_set[root])}')
        # Stop when the chosen column holds a single constant value.
        if(isUnique(training_set[root])):
            return

    # Step 2 - Repeat for every attribute
    print(f'Step 2 - Repeat for every attribute')
    # Loop 1
    attribute = ""
    maximum       = 0
    for (F, columnData) in training_set.iteritems():
        print(f'processing attribute {F}')
        # Loop 2
        # NOTE(review): ``Total`` is initialised but never accumulated; the
        # loop overwrites lowercase ``total`` each iteration, so only the
        # last branch contributes to the gain below.
        Total = 0
        uniques = training_set[F].unique()
        for k in uniques:
            print(f'processing branch {k} for {F}')
            # Calculate MC for column
            # NOTE(review): mc() is called on the whole column, not on the
            # subset where F == k, so every branch gets the same entropy.
            messageConveyed = mc(F,training_set)

            # Calculate the weight for F
            # NOTE(review): count() counts non-null cells, so this weight is
            # (almost always) 1, not |Fi|/|TF| from the pseudocode.
            F_D    = training_set[F].count()
            TF_D   = training_set[root].count()

            weight = F_D/TF_D
            total = weight*messageConveyed
        # NOTE(review): on recursive calls root != "" and mcDictionary was
        # never built in this frame -> NameError. Presumably MC(TF) was meant.
        gain = mcDictionary[root] - total
        if(gain > maximum):
            attribute = F
            maximum   = gain
        print(f"gain: {gain} for {F}")

    print(f'attribute {attribute} has the max gain of {gain}')
    print(f'removing {attribute}')
    root = attribute
    print(f'new root {root} has branches {training_set[root].unique()}')
    print(f'root is {root}')
    print("******************************************")
    print("**************   ROOT   ******************")
    print(f"TF is {root}**********************")
    print("******************************************")
    unique_values = training_set[root].unique()
    datasets = []
    for unique_value in unique_values:
        print(f'processing for file : {unique_value} ')
        # NOTE(review): strict > and < drop the rows that EQUAL unique_value,
        # and splitting on every unique value produces overlapping subsets;
        # the pseudocode asks for one subset per value (==), not two ranges.
        df_1 = training_set[training_set[attribute] > unique_value]
        df_2 = training_set[training_set[attribute] <  unique_value]
        datasets.append(df_1)
        datasets.append(df_2)

    del training_set[attribute]

    # Step 3 - Examine dataset of each leaf
    print(f'Step 3 - Examine dataset of each leaf')
    print(f'number of datasets {len(datasets)}')
    print("*****************")
    print("printing datasets")
    print("*****************")
    splits = {}
    all_values_same = False
    for df in datasets:
        print(f'Step 4 - for {attribute} dataset check is marked "split"')
        # NOTE(review): Series.is_unique is True when all values are
        # DISTINCT — the opposite of "all values are the same". The column
        # was also just deleted from training_set, and the class attribute
        # (not the split attribute) is what Step 3 should examine.
        if(df[attribute].is_unique):
            print(f'all values are the same no split')
            all_values_same = True
        else:
            print(f'values are not unique perform split')
            all_values_same = False
            splits.update({"split":df})

    if(not all_values_same):
        # NOTE(review): iterating a dict yields its KEYS (strings here), and
        # str has no .get -> AttributeError; splits also only ever holds one
        # entry because every dataset is stored under the same "split" key.
        for split in splits:
            ID3(root,split.get("split"),test_set)
    else:
        # NOTE(review): recursing with an unchanged training_set (minus one
        # column) has no base case on this path — risk of infinite recursion.
        ID3(root,training_set,test_set)

    print("*****************")

    

 # use the training set to predict the test set.
# use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
test_set = pd.read_csv("Assignment 2--Test set for ID3.csv")
training_set = pd.read_csv("Assignment 2--Training set for ID3.csv")

print('***********************************')
print('TRAINING SET')
print(training_set)
print('***********************************')


print('***********************************')
print('TEST SET')
print(test_set)
print('***********************************')

print(f'test_set: {test_set}')
print(f'training_set: {training_set}')

    

def BayesClassifier(training_set,test_set):
    """Fit a Gaussian naive-Bayes model (incomplete — no prediction step).

    NOTE(review): the arguments to fit() look wrong. GaussianNB.fit expects
    (features, labels) drawn from the TRAINING set, but here X is the whole
    test set and Y is the whole training set — presumably the intent was to
    split training_set into feature columns and a label column; confirm
    which column is the class attribute. As written this will raise a
    shape/label error inside sklearn. The fitted model is also discarded
    and the test set is never classified.
    """
    # use the assignment 2-- training set for Bayes as the training set to classify the records of the assignment 2 test set for bayes
    X = test_set.values
    Y = training_set.values
    clf = GaussianNB()
    clf.fit(X, Y)




# prompt user to select either ID3 or Bayes classifier.
# The input() calls are commented out; the selection is hard-coded for now.
selection = "ID3" #= input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9   #= input("Please enter a threshold: ")
g         = 0.05   #= input("Please enter a value for g: ")

# NOTE(review): threshold and g are never used anywhere in this script.
# An empty root tells ID3 this is the initial (non-recursive) call.
root = ""
if(selection == "ID3"):
    ID3(root,training_set,test_set)

if(selection == "Bayes"):
    BayesClassifier(training_set,test_set)

该程序的目标是在决策树中对训练数据进行分类,如下所示

代码语言:javascript
复制
           Veriety
          /       \
      Volume      Location

ect。

该程序的数据集如下:Assignment 2--Training set for ID3.csv(训练集)

代码语言:javascript
复制
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
2,6,4,4,4,2,2,1,1
1,2,4,4,4,1,6,2,6
1,5,4,4,4,1,2,1,6
2,4,4,4,4,2,6,1,4
1,4,4,4,4,1,2,2,2
2,4,3,3,3,2,1,1,1
1,5,2,1,4,1,6,2,6
1,2,3,3,3,1,2,1,6
2,6,4,4,4,2,3,1,1
1,4,4,4,4,1,2,1,6
1,5,4,4,4,1,2,1,4
1,4,5,5,5,1,6,2,4
2,5,4,4,4,2,3,1,1
1,5,5,5,5,1,6,2,5
2,6,5,5,5,2,2,1,4

Assignment 2--Test set for ID3.csv(测试集)

代码语言:javascript
复制
Venue,color,Model,Category,Location,weight,Veriety,Material,Volume
1,6,4,4,4,1,1,1,6
2,5,4,4,4,2,6,1,1
1,6,2,1,4,1,4,2,4
1,6,2,1,4,1,2,1,2
2,6,5,5,5,2,2,1,2
1,5,4,4,4,1,6,2,2
1,3,3,3,3,1,6,2,2
1,5,2,1,1,1,2,1,2
1,4,4,4,1,1,5,3,6
1,4,4,4,4,1,6,4,6

对此,我衷心感谢您的帮助。我对这个计划的最终结果没有一个明确的理解。如果我能理解这个过程,我就能完成这个程序了。

EN

回答 2

Data Science用户

发布于 2021-11-26 21:32:07

好的,让我们从基础开始:

  • ID3是一种决策树学习算法。这是监督学习,这意味着每个数据点(或实例)都有一些特性x和标签(或类) y。其目标是从一些标记的数据集(提供了xy的数据)中训练模型,以便该模型以后能够预测任何新实例x的标签y
  • 决策树将“决策过程”表示为树:从根开始,每个节点都表示关于输入特性x的问题。通过一个接一个地回答问题并遵循答案的路径,我们最终得到了最终的决定(标签)。ID3是一种从训练数据中建立决策树的算法。因此,ID3的输出是一个完整的决策树。它可以表示为一个节点列表,其中每个节点都有一个关于一个特性的条件,并根据答案(是条件为true或false)指向另外两个节点。
  • ID3过程是递归的,它一个接一个地构建节点,从根(顶部)到叶子(底部)。每次开始时,它都会查看所有可用的特性,以便在当前阶段选择信息最丰富的特性(帮助最大程度地决定标签的特性)。为此,它计算当前数据子集的统计度量(例如,这里的MC ),即根据我们所通过的前一个节点过滤后获得的统计度量。从技术上讲,子集是通过选择条件后在每个节点上分割当前数据来获得的,也就是说,就好像每个节点都有分配给它的特定训练数据集:根有完整的数据集,然后每个节点过滤它们接收的子集,直到不再需要分割的叶子。

希望这有助于澄清总的想法。

票数 2
EN

Data Science用户

发布于 2021-11-26 19:24:58

这部分我想通了。

代码语言:python
复制
from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
import math
from math import ceil, floor, log2
from sklearn.decomposition import PCA
from numpy import linalg as LA
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


# Step 1- Calculate MC (Message Conveyed) for the given dataset (let us call it file TF) in reference to  the class attribute 
# MC(TF) = -p1*log2(p1) - p2*log2(p2) 
def mc(classAttribute,attribute,training_set):
    """Entropy ("message conveyed"), MC = -sum(p_i * log2(p_i)).

    With ``attribute=None`` the entropy of column ``classAttribute`` is
    returned. Otherwise the frame is filtered to rows where
    ``classAttribute == attribute`` first.

    Args:
        classAttribute: column name to measure.
        attribute: branch value to filter on, or None for the whole column.
        training_set: pandas DataFrame holding the data.
    """
    column = training_set[classAttribute]

    # BUG FIX: the original used `if attribute:`, which silently skipped the
    # filter whenever the branch value was falsy (e.g. 0). Test against None.
    if attribute is not None:
        # NOTE(review): this selects the whole filtered DataFrame, so the
        # entropy below runs over distinct ROWS of the subset rather than a
        # single column — confirm this matches the intended MC(Fi).
        column = training_set[training_set[classAttribute] == attribute]

    probs = column.value_counts(normalize=True)
    messageConveyed = -1*np.sum(np.log2(probs)*probs)
    return messageConveyed

def wmc(classAttribute,attribute,training_set):
    """Branch weight: fraction of rows where ``classAttribute == attribute``."""
    matches = training_set[classAttribute] == attribute
    attributeCount = int(matches.sum())
    total = len(training_set[classAttribute])
    print(f'{attributeCount}/{total}')
    return attributeCount/total


def ID3(root,training_set,test_set):
    """Recursively pick the highest-gain attribute and split on it.

    ``root`` is the previously chosen attribute name ("" on the first call);
    ``test_set`` is accepted but unused in this body. Defects are flagged
    inline with NOTE(review) rather than fixed, to keep the code unchanged.
    """

    highestGainAttribute = ""
    highestGainValue     = -math.inf
    # NOTE(review): DataFrame.iteritems() was removed in pandas 2.0 (.items()).
    for classAttribute, values in training_set.iteritems():
        # Whole-column entropy, MC(TF) for this candidate attribute.
        messageConveyed = mc(classAttribute, attribute=None, training_set=training_set)
        print(f"{classAttribute} mc: {messageConveyed}")

        attributes = training_set[classAttribute].unique()
        print(f"{classAttribute}\n")
        weightedMessageConveyed = 0
        for attribute in attributes:
            weight = wmc(classAttribute, attribute, training_set)
            # NOTE(review): this rebinds messageConveyed, clobbering the
            # whole-column MC computed above — see the gain calculation below.
            messageConveyed = mc(classAttribute, attribute, training_set)
            print(f"wmc({attribute}) = {weight}")
            weightedMessageConveyed += weight*messageConveyed

        print(f'wmc({classAttribute}) = {weightedMessageConveyed}')
        # NOTE(review): after the loop, messageConveyed holds the LAST
        # branch's entropy, not MC(TF); the gain formula in the pseudocode
        # is MC(TF) - Total. Presumably a stale-variable bug — confirm.
        gain = messageConveyed - weightedMessageConveyed
        print(f'MC - wmc({classAttribute}) = {messageConveyed} - {weightedMessageConveyed} = {gain}')
        if gain > highestGainValue:
            highestGainAttribute = classAttribute
            highestGainValue     = gain

    print(f'winner is {highestGainAttribute} with gain of {highestGainValue}')
    root = highestGainAttribute
    leaves = training_set[root].unique()
    splits = {}
    for leaf in  leaves:
        print(f'leaf: {leaf} of root: {root}')
        # NOTE(review): after filtering root == leaf, the root column is
        # constant, so Series.is_unique ("all values distinct") is True only
        # for single-row subsets; Step 3 should examine the CLASS column of
        # the leaf's dataset instead.
        if training_set[training_set[root] == leaf][root].is_unique:
            print(f'all of the records for leaf: {leaf} are the same. NO SPLIT')
            splits.update({leaf:"no split"})
            # NOTE(review): this return abandons the remaining leaves as soon
            # as one "no split" leaf is found.
            return
        else:
            print(f'all of the records for leaf: {leaf} are NOT the same. SPLIT')
            splits.update({leaf:"split"})

    for leaf,split in splits.items():
        if split == "split":
            print(f"setting {leaf} as the new dataset")
            # Guard: the root column must still exist before filtering on it.
            # NOTE(review): training_set is rebound inside the loop, so later
            # leaves filter the already-reduced frame, not the original one.
            if root in training_set:
                training_set = training_set[training_set[root] == leaf].drop(columns=root)
                ID3(root,training_set,test_set)

# use the training set to predict the test set.
# use the Assignment 2--Training set to extract rules and test the quality of the extracted rules against the Assignment 2-- Test set for ID3.
# Load both CSVs; they must be in the working directory or read_csv raises.
test_set_ID3 = pd.read_csv("Assignment 2--Test set for ID3.csv")
training_set_ID3 = pd.read_csv("Assignment 2--Training set for ID3.csv")

# prompt user to select either ID3 or Bayes classifier.
# The input() prompts are disabled; values are hard-coded for now.
selection = "ID3" #= input("Please enter your selection for either ID3 or Bayes classification: ")
threshold = 0.9   #= input("Please enter a threshold: ")
g         = 0.05   #= input("Please enter a value for g: ")

# NOTE(review): threshold and g are never used in this script.
# An empty root marks the initial (non-recursive) ID3 call.
root = ""
if(selection == "ID3"):
    # Echo the loaded frames before running the algorithm.
    print('***********************************')
    print('TRAINING SET')
    print(training_set_ID3)
    print('***********************************')
    
    print('***********************************')
    print('TEST SET')
    print(test_set_ID3)
    print('***********************************')
    ID3(root,training_set_ID3,test_set_ID3)
```
代码语言:javascript
复制
票数 0
EN
页面原文内容由Data Science提供。腾讯云小微IT领域专用引擎提供翻译支持
原文链接:

https://datascience.stackexchange.com/questions/104516

复制
相关文章

相似问题

领券
问题归档专栏文章快讯文章归档关键词归档开发者手册归档开发者手册 Section 归档