我正在研究一种确定数据集中最小和最大频率的方法。nunique() 方法返回数据中不同值的计数,但没有返回预期的结果。我的目标如下。
例如,
样本输入数据
A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3
8,0.084782428,3.0,3
2,0.53226533,1.5,2
8,0.070034818,2.9,1
2,0.668631847,1.5,2
期望:(s_temp.nunique()['A1'] == floor(n/2)) 检查分区中 Class 属性的不同值数目是否等于 floor(n/2)。
实际:
File "assignment_1.py", line 18, in main
s = entropy_discretization(s)
File "assignment_1.py", line 78, in entropy_discretization
if (maxf(s1)/minf(s1) < 0.5) and (s_temp.nunique()['A1'] == floor(n/2)):
File "C:\Users\physe\AppData\Roaming\Python\Python36\site-packages\pandas\core\generic.py", line 1330, in __nonzero__
f"The truth value of a {type(self).__name__} is ambiguous. "
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
我编写了一个处理数据集的程序:
def main():
    """Load the sample dataset and run entropy-based discretization over it."""
    data = pd.read_csv('A1-dm.csv')
    print("******************************************************")
    print("Entropy Discretization STARTED")
    data = entropy_discretization(data)
    print("Entropy Discretization COMPLETED")
def entropy_discretization(s):
    """Discretize attribute A1 of *s* by trying each distinct A1 value as a
    split threshold and recording the information gain of every split.

    Parameters:
        s: DataFrame with at least columns 'A1' and 'Class'.

    Returns:
        The (currently unmodified) input DataFrame; the intended final step
        (maxPartition) is still commented out below.
    """
    I = {}
    i = 0
    n = s.nunique()['A1']
    s_temp = s
    s1 = pd.DataFrame()
    s2 = pd.DataFrame()
    while uniqueValue(s_temp):
        # Step 1: pick a threshold (first remaining A1 value).
        threshold = s_temp['A1'].iloc[0]
        # Step 2: partition the data set into two partitions.
        s1 = s[s['A1'] < threshold]
        print("s1 after spitting")
        print(s1)
        print("******************")
        s2 = s[s['A1'] >= threshold]
        print("s2 after spitting")
        print(s2)
        print("******************")
        print("******************")
        print("calculating maxf")
        maxf(s['A1'])
        print("******************")
        # BUG FIX: compare scalars, not DataFrames.  maxf(s1) returned a
        # Series (one max per column), and using a Series in a boolean
        # context raises "The truth value of a Series is ambiguous".
        # Selecting the 'A1' column makes max/min return plain scalars.
        if (maxf(s1['A1']) / minf(s1['A1']) < 0.5) and (s_temp.nunique()['A1'] == floor(n / 2)):
            break
        # Step 3: calculate the information gain.
        informationGain = information_gain(s1, s2, s_temp)
        I.update({f'informationGain_{i}': informationGain, f'threshold_{i}': threshold})
        print(f'added informationGain_{i}: {informationGain}, threshold_{i}: {threshold}')
        # Drop the tried threshold so the next distinct A1 value is used.
        s_temp = s_temp[s_temp['A1'] != threshold]
        i += 1
    # Step 5: find the threshold with the maximum information gain.
    # BUG FIX: I stores two entries (gain + threshold) per iteration, so the
    # number of recorded splits is len(I) // 2.  The previous
    # `int(len(I)/2 - 1)` combined with range(0, n) skipped the last split.
    n = len(I) // 2
    print("Calculating maximum threshold")
    print("*****************************")
    maxInformationGain = 0
    maxThreshold = 0
    for i in range(n):
        if I[f'informationGain_{i}'] > maxInformationGain:
            maxInformationGain = I[f'informationGain_{i}']
            maxThreshold = I[f'threshold_{i}']
    print(f'maxThreshold: {maxThreshold}, maxInformationGain: {maxInformationGain}')
    # replace all values in s1 with 1
    print(s1)
    print('***********************************')
    # replace all values in s2 with 2
    print(s2)
    # Step 6: keep the partitions of S based on the value of threshold_i
    return s  # maxPartition(maxInformationGain, maxThreshold, s, s1, s2)
def maxf(s):
    """Return the largest value held by the pandas object *s*."""
    largest = s.max()
    return largest
def minf(s):
    """Return the smallest value held by the pandas object *s*."""
    smallest = s.min()
    return smallest
def uniqueValue(s):
    """Return True while column 'A1' of *s* still holds more than one distinct value."""
    # A single remaining distinct value means there is nothing left to split on.
    return bool(s.nunique()['A1'] != 1)
def maxPartition(maxInformationGain, maxThreshold, s, s1, s2):
    """Report the winning split, then inner-join both partitions with the
    full frame and return the merged result."""
    print(f'informationGain: {maxInformationGain}, threshold: {maxThreshold}')
    # Join the two partitions with each other first, then with the full set.
    merged_partitions = pd.merge(pd.merge(s1, s2), s)
    print("Best Partition")
    print("***************")
    print(merged_partitions)
    print("***************")
    return merged_partitions
def information_gain(s1, s2, s):
    """Return the entropy of split (s1, s2), each child weighted by its share
    of distinct 'A1' values relative to *s*.

    NOTE(review): despite the name, this is the weighted child entropy, not
    entropy(s) minus that quantity — confirm which the caller expects.
    """
    # "Cardinality" here counts distinct A1 values, not rows.
    cardinalityS1 = len(pd.Index(s1['A1']).value_counts())
    print(f'The Cardinality of s1 is: {cardinalityS1}')
    cardinalityS2 = len(pd.Index(s2['A1']).value_counts())
    print(f'The Cardinality of s2 is: {cardinalityS2}')
    cardinalityS = len(pd.Index(s['A1']).value_counts())
    print(f'The Cardinality of s is: {cardinalityS}')
    weighted1 = (cardinalityS1 / cardinalityS) * entropy(s1)
    weighted2 = (cardinalityS2 / cardinalityS) * entropy(s2)
    informationGain = weighted1 + weighted2
    print(f'The total informationGain is: {informationGain}')
    return informationGain
def entropy(s):
    """Return the Shannon entropy (base 2) of the 'Class' column of *s*."""
    print("calculating the entropy for s")
    print("*****************************")
    print(s)
    print("*****************************")
    numberOfClasses = s['Class'].nunique()
    print(f'Number of classes for dataset: {numberOfClasses}')
    value_counts = s['Class'].value_counts()
    # Total record count is loop-invariant; compute it once.
    n = s['Class'].count()
    p = []
    for i in range(numberOfClasses):
        # frequency of class_i in s
        print(f'p{i} {value_counts.iloc[i]}/{n}')
        p.append(value_counts.iloc[i] / n)
    print(p)
    # H(S) = -sum(p_i * log2(p_i))
    return -sum(pi * log2(pi) for pi in p)
main()
如果能对此提供任何帮助,我们将不胜感激。
发布于 2021-10-20 11:43:27
您将得到行中的错误:if (maxf(s1)/minf(s1) < 0.5) and (s_temp.nunique()['A1'] == floor(n/2)):,因为s1是一个具有多列的数据格式,所以您的max函数返回多个值。因此,您收到的错误是:The truth value of a Series is ambiguous.
如果您在dataframe中选择了想要最大值的列,则错误将消失。见下面的例子:
# print(maxf(s['A1'])/minf(s['A1']))
if (maxf(s1['A1'])/minf(s1['A1']) < 0.5) and (s_temp.nunique()['A1'] == floor(n/2)):
        break
您的示例代码还遗漏了必要的导入语句:
import pandas as pd
from math import pi, floor, log2
https://stackoverflow.com/questions/69645150
复制相似问题