Question
I'm new to the Python language and having trouble writing the following method for a decision tree. Any help would be appreciated.
def count_labels(data):
    counts = {}
    # TODO: counts should count the labels in data
    # e.g. counts = {'spam': 10, 'ham': 4}
    return counts
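A minimal sketch of the counting loop being asked for (assuming each element of data is a Point carrying a .label string, as in the classes below):

def count_labels(data):
    counts = {}
    for point in data:
        # dict.get supplies 0 the first time a label appears
        counts[point.label] = counts.get(point.label, 0) + 1
    return counts

collections.Counter(point.label for point in data) would produce the same counts in one line.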
Here are the two classes given for this assignment:
//////////////////////// Tree class starts ////////////////////////////////
from math import log

class Tree:
    leaf = True
    prediction = None
    feature = None
    threshold = None
    left = None
    right = None
def predict(tree, point):
    if tree.leaf:
        return tree.prediction
    i = tree.feature
    if point.values[i] < tree.threshold:
        return predict(tree.left, point)
    else:
        return predict(tree.right, point)
def most_likely_class(prediction):
    labels = list(prediction.keys())
    probs = list(prediction.values())
    return labels[probs.index(max(probs))]

def accuracy(data, predictions):
    total = 0
    correct = 0
    for i in range(len(data)):
        point = data[i]
        pred = predictions[i]
        total += 1
        guess = most_likely_class(pred)
        if guess == point.label:
            correct += 1
    return float(correct) / total
def split_data(data, feature, threshold):
    left = []
    right = []
    # TODO: split data into left and right by the given feature.
    # left should contain points whose values are less than threshold;
    # right should contain points with values greater than or equal to
    # threshold. (A sketch follows this listing.)
    return (left, right)

def count_labels(data):
    counts = {}
    # TODO: counts should count the labels in data,
    # e.g. counts = {'spam': 10, 'ham': 4}
    # (see the sketch near the top of the question)
    return counts
def counts_to_entropy(counts):
    entropy = 0.0
    # TODO: convert a dictionary of counts into entropy.
    # (A sketch follows this listing.)
    return entropy

def get_entropy(data):
    counts = count_labels(data)
    entropy = counts_to_entropy(counts)
    return entropy
# This is a correct but inefficient way to find the best threshold to
# maximize information gain.
def find_best_threshold(data, feature):
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    for point in data:
        left, right = split_data(data, feature, point.values[feature])
        curr = (get_entropy(left)*len(left) + get_entropy(right)*len(right))/len(data)
        gain = entropy - curr
        if gain > best_gain:
            best_gain = gain
            best_threshold = point.values[feature]
    return (best_gain, best_threshold)
def find_best_threshold_fast(data, feature):
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    # TODO: Write a more efficient method to find the best threshold.
    # (A sketch follows this listing.)
    return (best_gain, best_threshold)

def find_best_split(data):
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    # TODO: find the feature and threshold that maximize information gain.
    # (A sketch follows this listing.)
    return (best_feature, best_threshold)
def make_leaf(data):
    tree = Tree()
    counts = count_labels(data)
    prediction = {}
    for label in counts:
        prediction[label] = float(counts[label])/len(data)
    tree.prediction = prediction
    return tree

def c45(data, max_levels):
    if max_levels <= 0:
        return make_leaf(data)
    # TODO: Construct a decision tree with the data and return it.
    # Your algorithm should return a leaf if the maximum depth is reached
    # or if there is no split that gains information; otherwise it should
    # greedily choose a feature and threshold to split on and recurse on
    # both partitions of the data. (A sketch follows this listing.)
    return make_leaf(data)
def submission(train, test):
    # TODO: Once your tests pass, make your submission as good as you can!
    tree = c45(train, 4)
    predictions = []
    for point in test:
        predictions.append(predict(tree, point))
    return predictions
# This might be useful for debugging.
def print_tree(tree):
    if tree.leaf:
        print("Leaf", tree.prediction)
    else:
        print("Branch", tree.feature, tree.threshold)
        print_tree(tree.left)
        print_tree(tree.right)
//////////////////////////////////// Tree class ends //////////////////////////////
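Sketches for the remaining TODOs follow; each assumes the helpers defined in the listing above. First, split_data under the contract in its comment (strictly less than the threshold goes left, everything else goes right):

def split_data(data, feature, threshold):
    left = []
    right = []
    for point in data:
        if point.values[feature] < threshold:
            left.append(point)
        else:
            right.append(point)
    return (left, right)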
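One plausible reading of counts_to_entropy computes Shannon entropy H = -sum(p * log2(p)) over the label proportions; the log base only rescales every gain by the same constant, so it does not change which split wins:

def counts_to_entropy(counts):
    entropy = 0.0
    total = sum(counts.values())
    for count in counts.values():
        if count > 0:  # 0*log(0) is treated as 0
            p = float(count) / total
            entropy -= p * log(p, 2)  # log is imported from math in the listing above
    return entropy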
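find_best_threshold re-splits and re-counts the whole dataset for every candidate threshold, which is quadratic per feature. The usual speed-up is to sort by the feature once and sweep a single boundary across the sorted points, updating two count dictionaries incrementally; this sketch assumes counts_to_entropy ignores zero counts, as the version above does:

def find_best_threshold_fast(data, feature):
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    n = len(data)
    points = sorted(data, key=lambda p: p.values[feature])
    left_counts = {}
    right_counts = count_labels(data)
    for i in range(1, n):
        # Move points[i-1] from the right side to the left side.
        label = points[i - 1].label
        left_counts[label] = left_counts.get(label, 0) + 1
        right_counts[label] -= 1
        # Only boundaries where the feature value changes are valid thresholds.
        if points[i].values[feature] == points[i - 1].values[feature]:
            continue
        curr = (counts_to_entropy(left_counts) * i +
                counts_to_entropy(right_counts) * (n - i)) / n
        gain = entropy - curr
        if gain > best_gain:
            best_gain = gain
            best_threshold = points[i].values[feature]
    return (best_gain, best_threshold)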
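find_best_split can then try every feature with the fast search and keep the winner (assuming every Point carries the same number of values):

def find_best_split(data):
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    for feature in range(len(data[0].values)):
        gain, threshold = find_best_threshold_fast(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold
    return (best_feature, best_threshold)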
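Finally, a sketch of the c45 recursion the TODO describes: return a leaf when the depth budget runs out or no split gains information, otherwise split greedily and recurse on both partitions:

def c45(data, max_levels):
    if max_levels <= 0:
        return make_leaf(data)
    feature, threshold = find_best_split(data)
    if feature is None or threshold is None:
        # No split gains information: stop with a leaf.
        return make_leaf(data)
    left, right = split_data(data, feature, threshold)
    tree = Tree()
    tree.leaf = False
    tree.feature = feature
    tree.threshold = threshold
    tree.left = c45(left, max_levels - 1)
    tree.right = c45(right, max_levels - 1)
    return tree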
////////////////////////// Point class begins //////////////////////////
class Point:
    def __str__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __repr__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __init__(self, label, values):
        self.label = label
        self.values = values
def get_label(s, labels):
    for l in labels:
        if l in s:
            return l
    raise Exception('Label not found', s)
def string_statistics(s):
    # Normalized character frequencies: slots 0-25 for 'a'-'z', then
    # '.', ',', '?', '!', digits, and everything else.
    s = s.lower()
    values = [0]*32
    total = len(s)
    for c in s:
        n = ord(c)
        if 97 <= n <= 122:
            values[n-97] += 1
        elif c == '.':
            values[26] += 1
        elif c == ',':
            values[27] += 1
        elif c == '?':
            values[28] += 1
        elif c == '!':
            values[29] += 1
        elif c in '0123456789':
            values[30] += 1
        else:
            values[31] += 1
    for i in range(len(values)):
        values[i] /= float(total)
    return values
def get_data(filename, labels):
    # Each line of filename names a document file; the label is embedded
    # in that file's name.
    data = []
    with open(filename) as f:
        for line in f:
            line = line.strip()
            label = get_label(line.split("."), labels)
            with open(line) as doc:
                s = doc.read()
            values = string_statistics(s)
            data.append(Point(label, values))
    return data
def get_spam_train_data():
    labels = ("ham", "spam")
    return get_data("spam/train.list", labels)

def get_spam_valid_data():
    labels = ("ham", "spam")
    return get_data("spam/valid.list", labels)

def get_college_data():
    data = [
        Point('College', [24, 40000]),
        Point('No College', [53, 52000]),
        Point('No College', [23, 25000]),
        Point('College', [25, 77000]),
        Point('College', [32, 48000]),
        Point('College', [52, 110000]),
        Point('College', [22, 38000]),
        Point('No College', [43, 44000]),
        Point('No College', [52, 27000]),
        Point('College', [48, 65000])
    ]
    return data
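With the TODOs filled in (for instance with the sketches above), the ten-point college dataset gives a quick smoke test:

data = get_college_data()
tree = c45(data, 2)
print_tree(tree)
predictions = [predict(tree, point) for point in data]
print(accuracy(data, predictions))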
Explanation / Answer
=> The script below is a worked example on a comma-separated voting dataset: the first field of each record is the label ('democrat' or 'republican') and the remaining fields are votes ('y', 'n', or '?').
=> File train.data grows the tree, validation.data drives the pruning pass, and test.data measures held-out accuracy.
import sys
import math
import copy
import matplotlib
import numpy as np
import matplotlib.pyplot as pyplot
with open("train.data") as foo:
content = foo.readlines()
data = []
for i in xrange(len(content)):
data.append((content[i].split(' ')[0]).split(','))
with open("validation.data") as foo1:
content = foo1.readlines()
gbdata1 = []
for i in xrange(len(content)):
gbdata1.append((content[i].split(' ')[0]).split(','))
with open("test.data") as foo2:
content = foo2.readlines()
gbdata2 = []
for i in xrange(len(content)):
gbdata2.append((content[i].split(' ')[0]).split(','))
gbdata0 = data
D1 = []
D2 = []
D3 = []
no_attributes = len(data[0]) - 1
list_available = [(i+1) for i in xrange(no_attributes)]
class Tree(object):
    def __init__(self):
        self.left = None      # subtree for feature value 'n'
        self.center = None    # subtree for feature value '?' (or anything else)
        self.right = None     # subtree for feature value 'y'
        self.data = None      # feature index, or -1 ('democrat' leaf) / -2 ('republican' leaf)
        self.predict = None   # majority label (-1/-2) to use if this node is pruned
def predict_accuracy(data_test, node):
    # Walk each example down the tree; -1 predicts 'democrat',
    # -2 predicts 'republican'.
    count = 0
    temp = node
    for i in range(len(data_test)):
        node = temp
        while True:
            if node.data == -1:
                if data_test[i][0] == 'democrat':
                    count += 1
                break
            if node.data == -2:
                if data_test[i][0] == 'republican':
                    count += 1
                break
            if data_test[i][node.data] == 'y':
                node = node.right
            elif data_test[i][node.data] == 'n':
                node = node.left
            else:
                node = node.center
    accuracy = float(count) / len(data_test)
    return accuracy
root = Tree()
counter = 0
def growtree(data, available, node):
    global gbdata0, gbdata1, gbdata2, counter
    no_examples = len(data)
    if no_examples == 0:
        node.data = -1
        return
    # Count the two classes; pure nodes become leaves immediately.
    no_democrat = 0
    no_republican = 0
    i = 0
    while i < no_examples:
        if data[i][0] == 'democrat':
            no_democrat += 1
        else:
            no_republican += 1
        i += 1
    if no_democrat == 0:
        node.data = -2
        return
    if no_republican == 0:
        node.data = -1
        return
    # Out of features: predict the majority class.
    if len(available) == 0:
        if no_republican > no_democrat:
            node.data = -2
            return
        else:
            node.data = -1
            return
    # Pick the feature whose three-way split ('n' / other / 'y') has the
    # lowest summed unnormalized entropy.
    error = float("inf")
    feature = 0
    c_0 = 0
    c_1 = 0
    c_2 = 0
    for j in range(len(available)):
        no_0 = 0       # democrats in the 'n' branch
        total_0 = 0
        no_1 = 0       # democrats in the '?' branch
        total_1 = 0
        no_2 = 0       # democrats in the 'y' branch
        total_2 = 0
        temp = 0
        for k in range(no_examples):
            if data[k][available[j]] == 'n':
                total_0 += 1
                if data[k][0] == 'democrat':
                    no_0 += 1
            elif data[k][available[j]] == 'y':
                total_2 += 1
                if data[k][0] == 'democrat':
                    no_2 += 1
            else:
                total_1 += 1
                if data[k][0] == 'democrat':
                    no_1 += 1
        # Each branch contributes t*log(t) - d*log(d) - (t-d)*log(t-d),
        # with 0*log(0) terms dropped.
        if total_0 != 0:
            if no_0 == 0:
                entropy0 = -1*((total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
            elif no_0 == total_0:
                entropy0 = -1*(no_0*math.log(no_0) - total_0*math.log(total_0))
            else:
                entropy0 = -1*(no_0*math.log(no_0) + (total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
        else:
            entropy0 = 0
        if total_1 != 0:
            if no_1 == 0:
                entropy1 = -1*((total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
            elif no_1 == total_1:
                entropy1 = -1*(no_1*math.log(no_1) - total_1*math.log(total_1))
            else:
                entropy1 = -1*(no_1*math.log(no_1) + (total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
        else:
            entropy1 = 0
        if total_2 != 0:
            if no_2 == 0:
                entropy2 = -1*((total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
            elif no_2 == total_2:
                entropy2 = -1*(no_2*math.log(no_2) - total_2*math.log(total_2))
            else:
                entropy2 = -1*(no_2*math.log(no_2) + (total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
        else:
            entropy2 = 0
        temp += entropy0 + entropy1 + entropy2
        if temp < error:
            error = temp
            feature = available[j]
            c_0 = no_0
            c_1 = no_1
            c_2 = no_2
    node.data = feature
    # Remember the majority label in case this node is pruned later.
    if (c_0 + c_1 + c_2) >= (no_examples - (c_0 + c_1 + c_2)):
        node.predict = -1
    else:
        node.predict = -2
    # Partition the examples by the chosen feature.
    data0 = []
    data1 = []
    data2 = []
    for j in range(no_examples):
        if data[j][feature] == 'n':
            data0.append(data[j])
        elif data[j][feature] == 'y':
            data2.append(data[j])
        else:
            data1.append(data[j])
    # Provisional majority leaves (overwritten by the recursion below).
    if c_0 >= len(data0) - c_0:
        node.left = Tree()
        node.left.data = -1
    else:
        node.left = Tree()
        node.left.data = -2
    if c_1 >= len(data1) - c_1:
        node.center = Tree()
        node.center.data = -1
    else:
        node.center = Tree()
        node.center.data = -2
    if c_2 >= len(data2) - c_2:
        node.right = Tree()
        node.right.data = -1
    else:
        node.right = Tree()
        node.right.data = -2
    counter += 3
    tmp = root
    #D1.append([counter, predict_accuracy(gbdata0, tmp)])
    #D2.append([counter, predict_accuracy(gbdata1, tmp)])
    #D3.append([counter, predict_accuracy(gbdata2, tmp)])
    # Each child recurses with its own copy of the remaining features.
    available.remove(feature)
    available0 = available[:]
    available1 = available[:]
    available2 = available[:]
    node.left = Tree()
    growtree(data0, available0, node.left)
    node.center = Tree()
    growtree(data1, available1, node.center)
    node.right = Tree()
    growtree(data2, available2, node.right)
growtree(data, list_available, root)
accuracy = predict_accuracy(gbdata1, root)
D = []
def no_nodes(node):
    if node.data == -1 or node.data == -2:
        return 1
    else:
        return 1 + no_nodes(node.left) + no_nodes(node.center) + no_nodes(node.right)
def prune(data, node):
    # Temporarily replace the subtree with its majority leaf, measure the
    # resulting accuracy on data, then restore the node.
    global root
    temp = node.data
    node.data = node.predict
    acc = predict_accuracy(data, root)
    node.data = temp
    return acc
def traverse(data, node):
    # Return the single prune anywhere in this subtree with the best
    # accuracy, provided it is at least the current accuracy;
    # (-1, None) signals that no prune helps.
    global accuracy
    if node.data == -1 or node.data == -2:
        return -1, None
    acc = prune(data, node)
    acc1, node1 = traverse(data, node.left)
    acc2, node2 = traverse(data, node.center)
    acc3, node3 = traverse(data, node.right)
    best = max(acc, acc1, acc2, acc3)
    if best >= accuracy:
        if acc == best:
            return acc, node
        elif acc1 == best:
            return acc1, node1
        elif acc2 == best:
            return acc2, node2
        else:
            return acc3, node3
    else:
        return -1, None
# Record the unpruned tree's size and accuracy on all three sets.
c = no_nodes(root)
D1.append([c, predict_accuracy(gbdata0, root)])
D2.append([c, predict_accuracy(gbdata1, root)])
D3.append([c, predict_accuracy(gbdata2, root)])
# Reduced-error pruning: keep applying the best single prune as long as it
# does not hurt validation accuracy, recording accuracies after each prune.
while True:
    acc, node = traverse(gbdata1, root)
    if acc == -1:
        break
    accuracy = acc
    node.data = node.predict
    node.left = None
    node.center = None
    node.right = None
    node.predict = None
    count = no_nodes(root)
    D2.append([count, acc])
    tmp = root
    D1.append([count, predict_accuracy(gbdata0, tmp)])
    D3.append([count, predict_accuracy(gbdata2, tmp)])
D1 = np.array(D1)
D2 = np.array(D2)
D3 = np.array(D3)
pyplot.plot(D1[:,0], D1[:,1], label='$T_{1}(x)$')  # train accuracy vs. node count
pyplot.plot(D2[:,0], D2[:,1], label='$T_{2}(x)$')  # validation accuracy
pyplot.plot(D3[:,0], D3[:,1], label='$T_{3}(x)$')  # test accuracy
pyplot.legend()
pyplot.savefig('example03.png')
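All nine entropy branches inside growtree encode the same identity: a branch holding t examples, d of them 'democrat', contributes the unnormalized entropy t*H = t*log(t) - d*log(d) - (t-d)*log(t-d), with 0*log(0) terms dropped. A compact helper making that explicit (branch_impurity is a hypothetical name, not part of the script above):

import math

def branch_impurity(d, t):
    # Unnormalized natural-log entropy t*H of a branch with t examples,
    # d of them labeled 'democrat'; 0*log(0) terms count as 0.
    if t == 0:
        return 0.0
    impurity = t * math.log(t)
    for n in (d, t - d):
        if n > 0:
            impurity -= n * math.log(n)
    return impurity

Summing this over the 'n', '?', and 'y' branches reproduces the temp value that growtree minimizes; dividing by the (fixed) number of examples would not change which feature wins.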