Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

I'm new to the Python language and having trouble writing the following method for

ID: 3663116 • Letter: I

Question

I'm new to the Python language and I'm having trouble writing the following method for a decision tree. Any help would be appreciated.

def find_best_split(data):
    """Skeleton: return the (feature, threshold) pair maximizing information gain.

    Returns (None, None) when data has fewer than two points, since no
    split is possible.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    # TODO: find the feature and threshold that maximize information gain.
    return (best_feature, best_threshold)

Here are the given two classes for this assignment:

//////////////////////// Tree class starts ////////////////////////////////

from math import log

class Tree:
    # Class-level defaults; tree-building code overwrites these per node.
    leaf = True          # True for leaf nodes, False for internal splits
    prediction = None    # dict mapping label -> probability (leaves only)
    feature = None       # index into point.values used for the split
    threshold = None     # split value: go left when point value < threshold
    left = None          # subtree for values < threshold
    right = None         # subtree for values >= threshold

def predict(tree, point):
    """Walk the tree for `point` and return the leaf's prediction dict."""
    node = tree
    while not node.leaf:
        # Descend: strictly-smaller feature values go left, the rest go right.
        if point.values[node.feature] < node.threshold:
            node = node.left
        else:
            node = node.right
    return node.prediction

def most_likely_class(prediction):
    """Return the label carrying the highest probability in `prediction`."""
    # max over the keys, compared by their probability; ties resolve to the
    # first maximal entry in dict order, as the index-based version did.
    return max(prediction, key=prediction.get)

def accuracy(data, predictions):
    """Fraction of points whose most likely predicted label matches point.label.

    predictions[i] is the prediction dict for data[i].
    """
    correct = 0
    for i, point in enumerate(data):
        guess = most_likely_class(predictions[i])
        if guess == point.label:
            correct += 1
    return float(correct) / len(data)

def split_data(data, feature, threshold):
    """Partition data by comparing point.values[feature] against threshold.

    Returns (left, right): left holds points whose value is strictly less
    than threshold; right holds points whose value is >= threshold.
    Original order within each partition is preserved.
    """
    left = []
    right = []
    for point in data:
        if point.values[feature] < threshold:
            left.append(point)
        else:
            right.append(point)
    return (left, right)

def count_labels(data):
    """Count how often each label occurs in data.

    Returns a dict such as {'spam': 10, 'ham': 4}.
    """
    counts = {}
    for point in data:
        counts[point.label] = counts.get(point.label, 0) + 1
    return counts

def counts_to_entropy(counts):
    """Convert a dict of label counts into the Shannon entropy, in bits.

    entropy = -sum(p * log2(p)) over the labels with nonzero count.
    An empty dict (or all-zero counts) yields 0.0.
    """
    total = sum(counts.values())
    entropy = 0.0
    for count in counts.values():
        if count > 0:
            p = float(count) / total
            entropy -= p * log(p, 2)
    return entropy
  
def get_entropy(data):
    """Shannon entropy of the label distribution found in data."""
    return counts_to_entropy(count_labels(data))

# This is a correct but inefficient way to find the best threshold to maximize
# information gain.
def find_best_threshold(data, feature):
    """Try every point's value as a threshold; return (best_gain, best_threshold).

    O(n^2): re-splits the data and recomputes entropies from scratch for
    each candidate. best_threshold stays None when no split strictly
    improves on zero gain.
    """
    entropy = get_entropy(data)
    best_gain = 0
    best_threshold = None
    for point in data:
        # Candidate threshold: this point's value for the feature.
        left, right = split_data(data, feature, point.values[feature])
        # Size-weighted average entropy of the two partitions.
        curr = (get_entropy(left)*len(left) + get_entropy(right)*len(right))/len(data)
        gain = entropy - curr
        if gain > best_gain:
            best_gain = gain
            best_threshold = point.values[feature]
    return (best_gain, best_threshold)

def find_best_threshold_fast(data, feature):
    """O(n log n) equivalent of find_best_threshold.

    Sorts the points by the feature value once, then sweeps the distinct
    values as candidate thresholds while incrementally moving label
    counts from the right partition to the left, so each candidate costs
    O(#labels) instead of a full re-split. Returns (best_gain,
    best_threshold); best_threshold is None when no candidate strictly
    improves on zero gain (matching the slow version).
    """
    def _entropy(counts, total):
        # Shannon entropy (bits) of a label-count dict over `total` points.
        e = 0.0
        for c in counts.values():
            if c > 0:
                p = float(c) / total
                e -= p * log(p, 2)
        return e

    total = len(data)
    right_counts = {}
    for point in data:
        right_counts[point.label] = right_counts.get(point.label, 0) + 1
    entropy = _entropy(right_counts, total)

    best_gain = 0
    best_threshold = None
    left_counts = {}
    n_left = 0
    prev_value = None
    for value, label in sorted((p.values[feature], p.label) for p in data):
        if value != prev_value:
            # Threshold `value` puts the n_left already-seen points
            # (strictly smaller values) on the left, the rest on the right.
            if n_left > 0:
                curr = (_entropy(left_counts, n_left) * n_left +
                        _entropy(right_counts, total - n_left)) / total
                gain = entropy - curr
                if gain > best_gain:
                    best_gain = gain
                    best_threshold = value
            prev_value = value
        # Move this point from the right side to the left side.
        left_counts[label] = left_counts.get(label, 0) + 1
        right_counts[label] -= 1
        n_left += 1
    return (best_gain, best_threshold)

def find_best_split(data):
    """Return the (feature, threshold) pair maximizing information gain.

    Tries every feature index of the points' value vectors and keeps the
    split with the strictly largest positive gain. Returns (None, None)
    when data has fewer than two points or when no split gains any
    information.
    """
    if len(data) < 2:
        return None, None
    best_feature = None
    best_threshold = None
    best_gain = 0
    for feature in range(len(data[0].values)):
        gain, threshold = find_best_threshold(data, feature)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
            best_threshold = threshold
    return (best_feature, best_threshold)

def make_leaf(data):
    """Build a leaf Tree predicting each label with its empirical frequency."""
    counts = count_labels(data)
    total = float(len(data))
    leaf = Tree()
    leaf.prediction = {label: count / total for label, count in counts.items()}
    return leaf

def c45(data, max_levels):
    """Greedily build a decision tree (C4.5-style) of depth <= max_levels.

    Returns a leaf when the depth budget is exhausted or when no split
    yields information gain; otherwise splits on the best (feature,
    threshold) pair and recurses on both partitions.
    """
    if max_levels <= 0:
        return make_leaf(data)
    feature, threshold = find_best_split(data)
    if feature is None:
        # No split gains information (or too few points) -> leaf.
        return make_leaf(data)
    left, right = split_data(data, feature, threshold)
    if not left or not right:
        # Degenerate split cannot separate the data; stop here.
        return make_leaf(data)
    tree = Tree()
    tree.leaf = False
    tree.feature = feature
    tree.threshold = threshold
    tree.left = c45(left, max_levels - 1)
    tree.right = c45(right, max_levels - 1)
    return tree

def submission(train, test):
    """Train a depth-4 tree on `train` and return its predictions for `test`."""
    tree = c45(train, 4)
    return [predict(tree, point) for point in test]

# This might be useful for debugging.
def print_tree(tree):
    # Recursively dump the tree, leaves first-person: note this is
    # Python 2 print-statement syntax, consistent with the rest of the file.
    if tree.leaf:
        print "Leaf", tree.prediction
    else:
        print "Branch", tree.feature, tree.threshold
        print_tree(tree.left)
        print_tree(tree.right)

//////////////////////////////////// Tree class ends //////////////////////////////

////////////////////////// Point class begins //////////////////////////

class Point:
    """A labeled feature vector: `label` is the class, `values` the features."""

    def __init__(self, label, values):
        self.label = label
        self.values = values

    # Bug fix: the original used Python 2 backtick repr syntax (`self.values`),
    # which was removed in Python 3; repr() is equivalent and works in both.
    def __str__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

    def __repr__(self):
        return "<" + self.label + ": " + repr(self.values) + ">"

def get_label(s, labels):
    """Return the first element of `labels` that occurs in `s`.

    Raises Exception('Label not found', s) when no label matches.
    """
    matching = [label for label in labels if label in s]
    if not matching:
        raise Exception('Label not found', s)
    return matching[0]

def string_statistics(s):
    """Return 32 normalized character-frequency features for `s`.

    Slots 0-25 are 'a'-'z' (case-folded), 26-29 are '.' ',' '?' '!',
    30 is any digit, 31 is everything else. Each count is divided by
    len(s); an empty string raises ZeroDivisionError.
    """
    s = s.lower()
    counts = [0] * 32
    punctuation_slots = {'.': 26, ',': 27, '?': 28, '!': 29}
    for ch in s:
        code = ord(ch)
        if 97 <= code <= 122:
            counts[code - 97] += 1
        elif ch in punctuation_slots:
            counts[punctuation_slots[ch]] += 1
        elif '0' <= ch <= '9':
            counts[30] += 1
        else:
            counts[31] += 1
    length = float(len(s))
    return [c / length for c in counts]

def get_data(filename, labels):
    """Load Points from the documents listed (one path per line) in filename.

    Each listed path doubles as the label source: the first element of
    labels found in the dot-separated path components becomes the label,
    and the document's character statistics become the values.

    Bug fix: the original leaked both file handles (no close); `with`
    guarantees they are closed even on error.
    """
    data = []
    with open(filename) as listing:
        for line in listing:
            line = line.strip()
            label = get_label(line.split("."), labels)
            with open(line) as doc:
                s = doc.read()
            values = string_statistics(s)
            data.append(Point(label, values))
    return data

def get_spam_train_data():
    """Load the spam-vs-ham training set listed in spam/train.list."""
    return get_data("spam/train.list", ("ham", "spam"))

def get_spam_valid_data():
    """Load the spam-vs-ham validation set listed in spam/valid.list."""
    return get_data("spam/valid.list", ("ham", "spam"))

def get_college_data():
    """Return a small hard-coded toy dataset: label plus [age, income]."""
    rows = [
        ('College', [24, 40000]),
        ('No College', [53, 52000]),
        ('No College', [23, 25000]),
        ('College', [25, 77000]),
        ('College', [32, 48000]),
        ('College', [52, 110000]),
        ('College', [22, 38000]),
        ('No College', [43, 44000]),
        ('No College', [52, 27000]),
        ('College', [48, 65000]),
    ]
    return [Point(label, values) for label, values in rows]

Explanation / Answer

=> train.data for training the tree
=> validation.data for pruning and test.data for testing the tree

import sys
import math
import copy
import matplotlib
import numpy as np
import matplotlib.pyplot as pyplot

# Load the training rows: one comma-separated record per line; anything
# after the first space is dropped. Row format: [class, vote1, vote2, ...].
# (Python 2 code: xrange, etc.)
with open("train.data") as foo:
   content = foo.readlines()
data = []
for i in xrange(len(content)):
   data.append((content[i].split(' ')[0]).split(','))

# Validation rows, same format (used for pruning below).
with open("validation.data") as foo1:
   content = foo1.readlines()
gbdata1 = []
for i in xrange(len(content)):
   gbdata1.append((content[i].split(' ')[0]).split(','))

# Test rows, same format.
with open("test.data") as foo2:
   content = foo2.readlines()
gbdata2 = []
for i in xrange(len(content)):
   gbdata2.append((content[i].split(' ')[0]).split(','))

gbdata0 = data  # alias for the training rows, mirroring gbdata1/gbdata2

# (node count, accuracy) samples for the train/validation/test curves.
D1 = []
D2 = []
D3 = []

# Attribute columns are 1..no_attributes; column 0 holds the class label.
no_attributes = len(data[0]) - 1
list_available = [(i+1) for i in xrange(no_attributes)]

class Tree(object):
   """Ternary decision-tree node: answer 'n' -> left, 'y' -> right, other -> center."""
   def __init__(self):
       self.left = None     # subtree for answer 'n'
       self.center = None   # subtree for any answer other than 'y'/'n'
       self.right = None    # subtree for answer 'y'
       self.data = None     # attribute index, or -1 ('democrat' leaf) / -2 ('republican' leaf)
       self.predict = None  # majority-class leaf code used if this node is pruned


def predict_accuracy(data_test, node):
    """Fraction of rows in data_test classified correctly by the tree at node.

    Each row is [class, vote1, ...]. node.data == -1 predicts 'democrat',
    -2 predicts 'republican'; otherwise node.data is the attribute index
    to test ('y' -> right, 'n' -> left, anything else -> center).
    """
    tree_root = node
    correct = 0
    for row in data_test:
        cursor = tree_root
        # Descend until a leaf marker (-1 or -2) is reached.
        while cursor.data != -1 and cursor.data != -2:
            answer = row[cursor.data]
            if answer == 'y':
                cursor = cursor.right
            elif answer == 'n':
                cursor = cursor.left
            else:
                cursor = cursor.center
        predicted = 'democrat' if cursor.data == -1 else 'republican'
        if row[0] == predicted:
            correct += 1
    return float(correct) / len(data_test)


# Root of the tree grown below; counter tracks nodes added while growing
# (used by the commented-out learning-curve sampling inside growtree).
root = Tree()
counter = 0

def growtree(data,available,node):
   """Recursively grow a ternary decision tree on the vote rows in data.

   data:      rows of ['democrat'|'republican', vote1, ...]
   available: attribute indices (into each row) still unused on this path;
              mutated in place (the chosen feature is removed).
   node:      Tree node to fill in; node.data becomes the chosen attribute
              index, or -1 / -2 for 'democrat' / 'republican' leaves.
   """
   global gbdata0,gbdata1,gbdata2,counter
   no_examples = len(data)

   # Empty partition: default to a 'democrat' leaf.
   if(no_examples == 0):
       node.data = -1
       return

   # Count class frequencies in this partition.
   no_democrat = 0
   no_republican = 0
   i = 0
   while(i<no_examples):
       if(data[i][0] == 'democrat'):
           no_democrat += 1
       else:
           no_republican += 1
       i += 1

   # Pure partitions become leaves.
   if(no_democrat == 0):
       node.data = -2
       return
   if(no_republican == 0):
       node.data = -1
       return

   # No attributes left: majority-class leaf (ties go to 'democrat').
   if(len(available) == 0):
       if(no_republican > no_democrat):
           node.data = -2
           return
       else:
           node.data = -1
           return

   # Choose the attribute whose 3-way split ('n' / other / 'y') minimizes
   # the summed unnormalized entropy. The c*ln(c) form below equals
   # total * (natural-log entropy); the scaling does not change the argmin.
   error = float("inf")
   feature = 0
   c_0 = 0
   c_1 = 0
   c_2 = 0
   for j in xrange(len(available)):
       # no_X = democrats in branch X, total_X = rows in branch X
       # (branch 0 = answer 'n', branch 1 = other, branch 2 = answer 'y').
       no_0 = 0
       total_0 = 0
       no_1 = 0
       total_1 = 0
       no_2 = 0
       total_2 = 0
       temp = 0
       for k in xrange(no_examples):
           if data[k][available[j]] == 'n':
               total_0 += 1
               if data[k][0] == 'democrat':
                   no_0 += 1
           elif data[k][available[j]] == 'y':
               total_2 += 1
               if data[k][0] == 'democrat':
                   no_2 += 1
           else:
               total_1 += 1
               if data[k][0] == 'democrat':
                   no_1 += 1

       # Unnormalized entropy of each branch; the pure/empty cases are
       # special-cased to avoid log(0).
       if(total_0 != 0):
           if(no_0 == 0):
               entropy0 = -1*((total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
           elif(no_0 == total_0):
               entropy0 = -1*(no_0*math.log(no_0) - total_0*math.log(total_0))
           else:
               entropy0 = -1*(no_0*math.log(no_0) + (total_0-no_0)*math.log(total_0-no_0) - total_0*math.log(total_0))
       else:
           entropy0 = 0

       if(total_1 != 0):
           if(no_1 == 0):
               entropy1 = -1*((total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
           elif(no_1 == total_1):
               entropy1 = -1*(no_1*math.log(no_1) - total_1*math.log(total_1))
           else:
               entropy1 = -1*(no_1*math.log(no_1) + (total_1-no_1)*math.log(total_1-no_1) - total_1*math.log(total_1))
       else:
           entropy1 = 0

       if(total_2 != 0):
           if(no_2 == 0):
               entropy2 = -1*((total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
           elif(no_2 == total_2):
               entropy2 = -1*(no_2*math.log(no_2) - total_2*math.log(total_2))
           else:
               entropy2 = -1*(no_2*math.log(no_2) + (total_2-no_2)*math.log(total_2-no_2) - total_2*math.log(total_2))
       else:
           entropy2 = 0

       # temp is reset to 0 each j, so this += is effectively an assignment.
       temp += entropy0 + entropy1 + entropy2
       if temp<error:
           error = temp
           feature = available[j]
           c_0 = no_0
           c_1 = no_1
           c_2 = no_2

   node.data = feature
   # c_0+c_1+c_2 is the democrat count here; store the majority class so
   # pruning can later collapse this node into a leaf.
   if((c_0+c_1+c_2) >= (no_examples-(c_0+c_1+c_2))):
       node.predict = -1
   else:
       node.predict = -2
   # Partition the rows by the chosen attribute's answer.
   data0 = []
   data1 = []
   data2 = []
   for j in xrange(no_examples):
       if(data[j][feature] == 'n'):
           data0.append(data[j])
       elif(data[j][feature] == 'y'):
           data2.append(data[j])
       else:
           data1.append(data[j])

   # NOTE(review): the provisional majority-class child leaves below are
   # dead code -- they are unconditionally overwritten by the fresh Tree()
   # nodes created just before the recursive calls at the end.
   if(c_0 >= len(data0)-c_0):
       node.left = Tree()
       node.left.data = -1
   else:
       node.left = Tree()
       node.left.data = -2

   if(c_1 >= len(data1)-c_1):
       node.center = Tree()
       node.center.data = -1
   else:
       node.center = Tree()
       node.center.data = -2

   if(c_2 >= len(data2)-c_2):
       node.right = Tree()
       node.right.data = -1
   else:
       node.right = Tree()
       node.right.data = -2

   counter += 3
   tmp = root
   #D1.append([counter, predict_accuracy(gbdata0,tmp)])
   #D2.append([counter, predict_accuracy(gbdata1,tmp)])
   #D3.append([counter, predict_accuracy(gbdata2,tmp)])

   # Recurse on each branch with this attribute removed from `available`.
   available.remove(feature)
   available0 = available[:]
   available1 = available[:]
   available2 = available[:]
   node.left = Tree()
   growtree(data0,available0,node.left)
   node.center = Tree()
   growtree(data1,available1,node.center)
   node.right = Tree()
   growtree(data2,available2,node.right)

# Grow the full tree, then take the baseline validation accuracy that the
# pruning loop below must match or beat.
growtree(data,list_available,root)

accuracy = predict_accuracy(gbdata1,root)
D = []  # NOTE(review): appears unused below

def no_nodes(node):
    """Count the nodes in the ternary tree rooted at node (-1/-2 mark leaves)."""
    if node.data == -1 or node.data == -2:
        return 1
    children = (node.left, node.center, node.right)
    return 1 + sum(no_nodes(child) for child in children)

def prune(data, node):
    """Accuracy on `data` if `node` were collapsed to its majority-class leaf.

    Temporarily swaps node.data for node.predict, evaluates the whole
    tree from the global root, then restores node.data.
    """
    global root
    saved = node.data
    node.data = node.predict
    trial_accuracy = predict_accuracy(data, root)
    node.data = saved
    return trial_accuracy

def traverse(data, node):
    """Find the best single node to prune in the subtree rooted at `node`.

    Returns (acc, candidate) for the prune candidate with the highest
    accuracy on `data` (ties prefer the current node, then left, center,
    right) -- but only if that accuracy is at least the current global
    `accuracy`; otherwise (-1, None).
    """
    global accuracy
    if node.data == -1 or node.data == -2:
        return -1, None
    candidates = [
        (prune(data, node), node),
        traverse(data, node.left),
        traverse(data, node.center),
        traverse(data, node.right),
    ]
    best_acc = max(candidate_acc for candidate_acc, _ in candidates)
    if best_acc < accuracy:
        return -1, None
    for candidate_acc, candidate_node in candidates:
        if candidate_acc == best_acc:
            return candidate_acc, candidate_node

# Record the unpruned tree's size and its accuracy on all three datasets.
c = no_nodes(root)
D1.append([c, predict_accuracy(gbdata0,root)])
D2.append([c, predict_accuracy(gbdata1,root)])
D3.append([c, predict_accuracy(gbdata2,root)])

# Reduced-error pruning: repeatedly collapse the single node whose removal
# best preserves validation accuracy, until no candidate helps.
while(True):
   acc,node = traverse(gbdata1,root)
   if(acc == -1):
       break
   accuracy = acc
   # Collapse the chosen node into its precomputed majority-class leaf.
   node.data = node.predict
   node.left = None
   node.center = None
   node.right = None
   node.predict = None
   count = no_nodes(root)
   D2.append([count, acc])
   tmp = root
   D1.append([count, predict_accuracy(gbdata0,tmp)])
   D3.append([count, predict_accuracy(gbdata2,tmp)])

# Plot accuracy vs. tree size for the train/validation/test curves.
D1 = np.array(D1)
D2 = np.array(D2)
D3 = np.array(D3)
# Bug fix: all three curves were labelled '$T_{1}(x)$' (copy-paste), and the
# labels were never shown; give each curve its own label and draw the legend.
pyplot.plot(D1[:,0],D1[:,1],label='$T_{1}(x)$')
pyplot.plot(D2[:,0],D2[:,1],label='$T_{2}(x)$')
pyplot.plot(D3[:,0],D3[:,1],label='$T_{3}(x)$')
pyplot.legend()
pyplot.savefig('example03.png')

Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote