Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

I'm new to the Python language and having trouble writing the following method for

ID: 3663116 • Letter: I

Question

I'm new to the Python language and I'm having trouble writing the following method for a decision tree. Any help would be appreciated.

def find_best_split(data):
    """Skeleton: return the (feature, threshold) pair maximizing information gain.

    Returns (None, None) when data has fewer than two points, since no
    split is possible.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    # TODO: find the feature and threshold that maximize information gain.
    return (best_feature, best_threshold)

Here are the given two classes for this assignment:

//////////////////////// Tree class starts ////////////////////////////////

from math import log

class Tree:
    # Class-level defaults; tree-building code overwrites these per node.
    leaf = True          # True for leaf nodes, False for internal splits
    prediction = None    # dict mapping label -> probability (leaves only)
    feature = None       # index into point.values used for the split
    threshold = None     # split value: go left when point value < threshold
    left = None          # subtree for values < threshold
    right = None         # subtree for values >= threshold

def predict(tree, point):
    """Walk the tree for `point` and return the leaf's prediction dict."""
    node = tree
    while not node.leaf:
        # Descend: strictly-smaller feature values go left, the rest go right.
        if point.values[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.prediction

def most_likely_class(prediction):
    """Return the label carrying the highest probability in `prediction`."""
    # max over the keys, compared by their probability; ties resolve to the
    # first maximal entry in dict order, as the index-based version did.
    return max(prediction, key=prediction.get)

def accuracy(data, predictions):
    """Fraction of points whose most likely predicted label matches point.label.

    predictions[i] is the prediction dict for data[i].
    """
    correct = 0
    for i, point in enumerate(data):
        guess = most_likely_class(predictions[i])
        if guess == point.label:
            correct += 1
    return float(correct) / len(data)

def split_data(data, feature, threshold):
    """Partition data by comparing point.values[feature] against threshold.

    Returns (left, right): left holds points whose value is strictly less
    than threshold; right holds points whose value is >= threshold.
    Original order within each partition is preserved.
    """
    left = []
    right = []
    for point in data:
        if point.values[feature] < threshold:
            left.append(point)
        else:
            right.append(point)
    return (left, right)

def count_labels(data):
    """Count how often each label occurs in data.

    Returns a dict such as {'spam': 10, 'ham': 4}.
    """
    counts = {}
    for point in data:
        counts[point.label] = counts.get(point.label, 0) + 1
    return counts

def counts_to_entropy(counts):
    """Convert a dict of label counts into the Shannon entropy, in bits.

    entropy = -sum(p * log2(p)) over the labels with nonzero count.
    An empty dict (or all-zero counts) yields 0.0.
    """
    total = sum(counts.values())
    entropy = 0.0
    for count in counts.values():
        if count > 0:
            p = float(count) / total
            entropy -= p * log(p, 2)
    return entropy
  
def get_entropy(data):
    """Shannon entropy of the label distribution found in data."""
    return counts_to_entropy(count_labels(data))

# This is a correct but inefficient way to find the best threshold to maximize
# information gain.
def find_best_threshold(data, feature):
    """Try every point's value as a threshold; return (best_gain, best_threshold).

    O(n^2): re-splits the data and recomputes entropies from scratch for
    each candidate. best_threshold stays None when no split strictly
    improves on zero gain.
    """
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    for point in data:
        # Candidate threshold: this point's value for the feature.
        left, right = split_data(data, feature, point.values[feature])
        # Size-weighted average entropy of the two partitions.
        curr = (get_entropy(left)*len(left) + get_entropy(right)*len(right))/len(data)
        gain = entropy - curr
        if gain > best_gain:
            best_gain = gain
            best_threshold = point.values[feature]
    return (best_gain, best_threshold)

def find_best_threshold_fast(data, feature):
    """O(n log n) equivalent of find_best_threshold.

    Sorts the points by the feature value once, then sweeps the distinct
    values as candidate thresholds while incrementally moving label
    counts from the right partition to the left, so each candidate costs
    O(#labels) instead of a full re-split. Returns (best_gain,
    best_threshold); best_threshold is None when no candidate strictly
    improves on zero gain (matching the slow version).
    """
    def _entropy(counts, total):
        # Shannon entropy (bits) of a label-count dict over `total` points.
        e = 0.0
        for c in counts.values():
            if c > 0:
                p = float(c) / total
                e -= p * log(p, 2)
        return e

    total = len(data)
    right_counts = {}
    for point in data:
        right_counts[point.label] = right_counts.get(point.label, 0) + 1
    entropy = _entropy(right_counts, total)

    best_gain = 0
    best_threshold = None
    left_counts = {}
    n_left = 0
    prev_value = None
    for value, label in sorted((p.values[feature], p.label) for p in data):
        if value != prev_value:
            # Threshold `value` puts the n_left already-seen points
            # (strictly smaller values) on the left, the rest on the right.
            if n_left > 0:
                curr = (_entropy(left_counts, n_left) * n_left +
                        _entropy(right_counts, total - n_left)) / total
                gain = entropy - curr
                if gain > best_gain:
                    best_gain = gain
                    best_threshold = value
            prev_value = value
        # Move this point from the right side to the left side.
        left_counts[label] = left_counts.get(label, 0) + 1
        right_counts[label] -= 1
        n_left += 1
    return (best_gain, best_threshold)

def find_best_split(data):
    """Return the (feature, threshold) pair maximizing information gain.

    Tries every feature index of the points' value vectors and keeps the
    split with the strictly largest positive gain. Returns (None, None)
    when data has fewer than two points or when no split gains any
    information.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    for feature in range(len(data[0].values)):
        gain, threshold = find_best_threshold(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold
    return (best_feature, best_threshold)

def make_leaf(data):
    """Build a leaf Tree predicting each label with its empirical frequency."""
    counts = count_labels(data)
    total = float(len(data))
    leaf = Tree()
    leaf.prediction = {label: count / total for label, count in counts.items()}
    return leaf

def c45(data, max_levels):
    """Greedily build a decision tree (C4.5-style) of depth <= max_levels.

    Returns a leaf when the depth budget is exhausted or when no split
    yields information gain; otherwise splits on the best (feature,
    threshold) pair and recurses on both partitions.
    """
    if max_levels <= 0:
        return make_leaf(data)
    feature, threshold = find_best_split(data)
    if feature is None:
        # No split gains information (or too few points) -> leaf.
        return make_leaf(data)
    left, right = split_data(data, feature, threshold)
    if not left or not right:
        # Degenerate split cannot separate the data; stop here.
        return make_leaf(data)
    tree = Tree()
    tree.leaf = False
    tree.feature = feature
    tree.threshold = threshold
    tree.left = c45(left, max_levels - 1)
    tree.right = c45(right, max_levels - 1)
    return tree

def submission(train, test):
    """Train a depth-4 tree on `train` and return its predictions for `test`."""
    tree = c45(train, 4)
    return [predict(tree, point) for point in test]

# This might be useful for debugging.
def print_tree(tree):
    # Recursively dump the tree, leaves first-person: note this is
    # Python 2 print-statement syntax, consistent with the rest of the file.
    if tree.leaf:
        print "Leaf", tree.prediction
    else:
        print "Branch", tree.feature, tree.threshold
        print_tree(tree.left)
        print_tree(tree.right)

//////////////////////////////////// Tree class ends //////////////////////////////

////////////////////////// Point class begins //////////////////////////

class Point:
    """A labeled feature vector: `label` is the class, `values` the features."""

    def __init__(self, label, values):
        self.label = label
        self.values = values

    # Bug fix: the original used Python 2 backtick repr syntax (`self.values`),
    # which was removed in Python 3; repr() is equivalent and works in both.
    def __str__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __repr__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

def get_label(s, labels):
    """Return the first element of `labels` that occurs in `s`.

    Raises Exception('Label not found', s) when no label matches.
    """
    matching = [label for label in labels if label in s]
    if not matching:
        raise Exception('Label not found', s)
    return matching[0]

def string_statistics(s):
    """Return 32 normalized character-frequency features for `s`.

    Slots 0-25 are 'a'-'z' (case-folded), 26-29 are '.' ',' '?' '!',
    30 is any digit, 31 is everything else. Each count is divided by
    len(s); an empty string raises ZeroDivisionError.
    """
    s = s.lower()
    counts = [0] * 32
    punctuation_slots = {'.': 26, ',': 27, '?': 28, '!': 29}
    for ch in s:
        code = ord(ch)
        if 97 <= code <= 122:
            counts[code - 97] += 1
        elif ch in punctuation_slots:
            counts[punctuation_slots[ch]] += 1
        elif '0' <= ch <= '9':
            counts[30] += 1
        else:
            counts[31] += 1
    length = float(len(s))
    return [c / length for c in counts]

def get_data(filename, labels):
    """Load Points from the documents listed (one path per line) in filename.

    Each listed path doubles as the label source: the first element of
    labels found in the dot-separated path components becomes the label,
    and the document's character statistics become the values.

    Bug fix: the original leaked both file handles (no close); `with`
    guarantees they are closed even on error.
    """
    data = []
    with open(filename) as listing:
        for line in listing:
            line = line.strip()
            label = get_label(line.split("."), labels)
            with open(line) as doc:
                s = doc.read()
            values = string_statistics(s)
            data.append(Point(label, values))
    return data

def get_spam_train_data():
    """Load the spam-vs-ham training set listed in spam/train.list."""
    return get_data("spam/train.list", ("ham", "spam"))

def get_spam_valid_data():
    """Load the spam-vs-ham validation set listed in spam/valid.list."""
    return get_data("spam/valid.list", ("ham", "spam"))

def get_college_data():
    """Return a small hard-coded toy dataset: label plus [age, income]."""
    rows = [
        ('College', [24, 40000]),
        ('No College', [53, 52000]),
        ('No College', [23, 25000]),
        ('College', [25, 77000]),
        ('College', [32, 48000]),
        ('College', [52, 110000]),
        ('College', [22, 38000]),
        ('No College', [43, 44000]),
        ('No College', [52, 27000]),
        ('College', [48, 65000]),
    ]
    return [Point(label, values) for label, values in rows]

Explanation / Answer

=> train.data for training the tree
=> validation.data for pruning and test.data for testing the tree

import sys
import math
import copy
import matplotlib
import numpy as np
import matplotlib.pyplot as pyplot

# Load the training rows: one comma-separated record per line; anything
# after the first space is dropped. Row format: [class, vote1, vote2, ...].
# (Python 2 code: xrange, etc.)
with open("train.data") as foo:
   content = foo.readlines()
data = []
for i in xrange(len(content)):
   data.append((content[i].split(' ')[0]).split(','))

# Validation rows, same format (used for pruning below).
with open("validation.data") as foo1:
   content = foo1.readlines()
gbdata1 = []
for i in xrange(len(content)):
   gbdata1.append((content[i].split(' ')[0]).split(','))

# Test rows, same format.
with open("test.data") as foo2:
   content = foo2.readlines()
gbdata2 = []
for i in xrange(len(content)):
   gbdata2.append((content[i].split(' ')[0]).split(','))

gbdata0 = data  # alias for the training rows, mirroring gbdata1/gbdata2

# (node count, accuracy) samples for the train/validation/test curves.
D1 = []
D2 = []
D3 = []

# Attribute columns are 1..no_attributes; column 0 holds the class label.
no_attributes = len(data[0]) - 1
list_available = [(i+1) for i in xrange(no_attributes)]

class Tree(object):
   """Ternary decision-tree node: answer 'n' -> left, 'y' -> right, other -> center."""
   def __init__(self):
       self.left = None     # subtree for answer 'n'
       self.center = None   # subtree for any answer other than 'y'/'n'
       self.right = None    # subtree for answer 'y'
       self.data = None     # attribute index, or -1 ('democrat' leaf) / -2 ('republican' leaf)
       self.predict = None  # majority-class leaf code used if this node is pruned


def predict_accuracy(data_test, node):
    """Fraction of rows in data_test classified correctly by the tree at node.

    Each row is [class, vote1, ...]. node.data == -1 predicts 'democrat',
    -2 predicts 'republican'; otherwise node.data is the attribute index
    to test ('y' -> right, 'n' -> left, anything else -> center).
    """
    tree_root = node
    correct = 0
    for row in data_test:
        cursor = tree_root
        # Descend until a leaf marker (-1 or -2) is reached.
        while cursor.data != -1 and cursor.data != -2:
            answer = row[cursor.data]
            if answer == 'y':
                cursor = cursor.right
            elif answer == 'n':
                cursor = cursor.left
            else:
                cursor = cursor.center
        predicted = 'democrat' if cursor.data == -1 else 'republican'
        if row[0] == predicted:
            correct += 1
    return float(correct) / len(data_test)


# Root of the tree grown below; counter tracks nodes added while growing
# (used by the commented-out learning-curve sampling inside growtree).
root = Tree()
counter = 0

def growtree(data,available,node):
   """Recursively grow a ternary decision tree on the vote rows in data.

   data:      rows of ['democrat'|'republican', vote1, ...]
   available: attribute indices (into each row) still unused on this path;
              mutated in place (the chosen feature is removed).
   node:      Tree node to fill in; node.data becomes the chosen attribute
              index, or -1 / -2 for 'democrat' / 'republican' leaves.
   """
   global gbdata0,gbdata1,gbdata2,counter
   no_examples = len(data)

   # Empty partition: default to a 'democrat' leaf.
   if(no_examples == 0):
       node.data = -1
       return

   # Count class frequencies in this partition.
   no_democrat = 0
   no_republican = 0
   i = 0
   while(i<no_examples):
       if(data[i][0] == 'democrat'):
           no_democrat += 1
       else:
           no_republican += 1
       i += 1

   # Pure partitions become leaves.
   if(no_democrat == 0):
       node.data = -2
       return
   if(no_republican == 0):
       node.data = -1
       return

   # No attributes left: majority-class leaf (ties go to 'democrat').
   if(len(available) == 0):
       if(no_republican > no_democrat):
           node.data = -2
           return
       else:
           node.data = -1
           return

   # Choose the attribute whose 3-way split ('n' / other / 'y') minimizes
   # the summed unnormalized entropy. The c*ln(c) form below equals
   # total * (natural-log entropy); the scaling does not change the argmin.
   error = float("inf")
   feature = 0
   c_0 = 0
   c_1 = 0
   c_2 = 0
   for j in xrange(len(available)):
       # no_X = democrats in branch X, total_X = rows in branch X
       # (branch 0 = answer 'n', branch 1 = other, branch 2 = answer 'y').
       no_0 = 0
       total_0 = 0
       no_1 = 0
       total_1 = 0
       no_2 = 0
       total_2 = 0
       temp = 0
       for k in xrange(no_examples):
           if data[k][available[j]] == 'n':
               total_0 += 1
               if data[k][0] == 'democrat':
                   no_0 += 1
           elif data[k][available[j]] == 'y':
               total_2 += 1
               if data[k][0] == 'democrat':
                   no_2 += 1
           else:
               total_1 += 1
               if data[k][0] == 'democrat':
                   no_1 += 1

       # Unnormalized entropy of each branch; the pure/empty cases are
       # special-cased to avoid log(0).
       if(total_0 != 0):
           if(no_0 == 0):
               entropy0 = -1*((total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
           elif(no_0 == total_0):
               entropy0 = -1*(no_0*math.log(no_0) - total_0*math.log(total_0))
           else:
               entropy0 = -1*(no_0*math.log(no_0) + (total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
       else:
           entropy0 = 0

       if(total_1 != 0):
           if(no_1 == 0):
               entropy1 = -1*((total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
           elif(no_1 == total_1):
               entropy1 = -1*(no_1*math.log(no_1) - total_1*math.log(total_1))
           else:
               entropy1 = -1*(no_1*math.log(no_1) + (total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
       else:
           entropy1 = 0

       if(total_2 != 0):
           if(no_2 == 0):
               entropy2 = -1*((total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
           elif(no_2 == total_2):
               entropy2 = -1*(no_2*math.log(no_2) - total_2*math.log(total_2))
           else:
               entropy2 = -1*(no_2*math.log(no_2) + (total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
       else:
           entropy2 = 0

       # temp is reset to 0 each j, so this += is effectively an assignment.
       temp += entropy0 + entropy1 + entropy2
       if temp<error:
           error = temp
           feature = available[j]
           c_0 = no_0
           c_1 = no_1
           c_2 = no_2

   node.data = feature
   # c_0+c_1+c_2 is the democrat count here; store the majority class so
   # pruning can later collapse this node into a leaf.
   if((c_0+c_1+c_2) >= (no_examples-(c_0+c_1+c_2))):
       node.predict = -1
   else:
       node.predict = -2
   # Partition the rows by the chosen attribute's answer.
   data0 = []
   data1 = []
   data2 = []
   for j in xrange(no_examples):
       if(data[j][feature] == 'n'):
           data0.append(data[j])
       elif(data[j][feature] == 'y'):
           data2.append(data[j])
       else:
           data1.append(data[j])

   # NOTE(review): the provisional majority-class child leaves below are
   # dead code -- they are unconditionally overwritten by the fresh Tree()
   # nodes created just before the recursive calls at the end.
   if(c_0 >= len(data0)-c_0):
       node.left = Tree()
       node.left.data = -1
   else:
       node.left = Tree()
       node.left.data = -2

   if(c_1 >= len(data1)-c_1):
       node.center = Tree()
       node.center.data = -1
   else:
       node.center = Tree()
       node.center.data = -2

   if(c_2 >= len(data2)-c_2):
       node.right = Tree()
       node.right.data = -1
   else:
       node.right = Tree()
       node.right.data = -2

   counter += 3
   tmp = root
   #D1.append([counter, predict_accuracy(gbdata0,tmp)])
   #D2.append([counter, predict_accuracy(gbdata1,tmp)])
   #D3.append([counter, predict_accuracy(gbdata2,tmp)])

   # Recurse on each branch with this attribute removed from `available`.
   available.remove(feature)
   available0 = available[:]
   available1 = available[:]
   available2 = available[:]
   node.left = Tree()
   growtree(data0,available0,node.left)
   node.center = Tree()
   growtree(data1,available1,node.center)
   node.right = Tree()
   growtree(data2,available2,node.right)

# Grow the full tree, then take the baseline validation accuracy that the
# pruning loop below must match or beat.
growtree(data,list_available,root)

accuracy = predict_accuracy(gbdata1,root)
D = []  # NOTE(review): appears unused below

def no_nodes(node):
    """Count the nodes in the ternary tree rooted at node (-1/-2 mark leaves)."""
    if node.data == -1 or node.data == -2:
        return 1
    children = (node.left, node.center, node.right)
    return 1 + sum(no_nodes(child) for child in children)

def prune(data, node):
    """Accuracy on `data` if `node` were collapsed to its majority-class leaf.

    Temporarily swaps node.data for node.predict, evaluates the whole
    tree from the global root, then restores node.data.
    """
    global root
    saved = node.data
    node.data = node.predict
    trial_accuracy = predict_accuracy(data, root)
    node.data = saved
    return trial_accuracy

def traverse(data, node):
    """Find the best single node to prune in the subtree rooted at `node`.

    Returns (acc, candidate) for the prune candidate with the highest
    accuracy on `data` (ties prefer the current node, then left, center,
    right) -- but only if that accuracy is at least the current global
    `accuracy`; otherwise (-1, None).
    """
    global accuracy
    if node.data == -1 or node.data == -2:
        return -1, None
    candidates = [
        (prune(data, node), node),
        traverse(data, node.left),
        traverse(data, node.center),
        traverse(data, node.right),
    ]
    best_acc = max(candidate_acc for candidate_acc, _ in candidates)
    if best_acc < accuracy:
        return -1, None
    for candidate_acc, candidate_node in candidates:
        if candidate_acc == best_acc:
            return candidate_acc, candidate_node

# Record the unpruned tree's size and its accuracy on all three datasets.
c = no_nodes(root)
D1.append([c, predict_accuracy(gbdata0,root)])
D2.append([c, predict_accuracy(gbdata1,root)])
D3.append([c, predict_accuracy(gbdata2,root)])

# Reduced-error pruning: repeatedly collapse the single node whose removal
# best preserves validation accuracy, until no candidate helps.
while(True):
   acc,node = traverse(gbdata1,root)
   if(acc == -1):
       break
   accuracy = acc
   # Collapse the chosen node into its precomputed majority-class leaf.
   node.data = node.predict
   node.left = None
   node.center = None
   node.right = None
   node.predict = None
   count = no_nodes(root)
   D2.append([count, acc])
   tmp = root
   D1.append([count, predict_accuracy(gbdata0,tmp)])
   D3.append([count, predict_accuracy(gbdata2,tmp)])

# Plot accuracy vs. tree size for the train/validation/test curves.
D1 = np.array(D1)
D2 = np.array(D2)
D3 = np.array(D3)
# Bug fix: all three curves were labelled '$T_{1}(x)$' (copy-paste), and the
# labels were never shown; give each curve its own label and draw the legend.
pyplot.plot(D1[:,0],D1[:,1],label='$T_{1}(x)$')
pyplot.plot(D2[:,0],D2[:,1],label='$T_{2}(x)$')
pyplot.plot(D3[:,0],D3[:,1],label='$T_{3}(x)$')
pyplot.legend()
pyplot.savefig('example03.png')

Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote