You are provided with a dataset that includes data of people who are interviewed
ID: 3687208 • Letter: Y
Question
You are provided with a dataset that includes data of people who are interviewed for the Data Scientist job. The data is below and the fields are: each candidate's level, their preferred language, whether they are active on Twitter, and whether he/she has a PhD. The class label (last attribute) is either Yes (the candidate interviewed well) or No (the candidate interviewed poorly).
inputs = [
({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False),
({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, False),
({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, True),
({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),
({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),
({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),
({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, True),
({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, True),
({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)
]
You are asked to use this data to build a model identifying which candidates will interview well (so that your boss does not have to waste his time interviewing candidates!).
HINTS:
- Use decision trees, and ID3 decision tree
- Use Entropy to decide whether or not to split a node, and on which attribute to split (unlike in class, where we used mostly GINI)
Write a python code for above example.
Explanation / Answer
Given below is the code, with comments to make it easy to understand.
import math
import random
from collections import Counter, defaultdict
from functools import partial
def entropy(class_probabi):
    """Shannon entropy (in bits) of a node, given its list of class probabilities.

    Zero probabilities contribute nothing and are skipped to avoid log(0).
    """
    total = 0.0
    for p in class_probabi:
        if p:
            total -= p * math.log(p, 2)
    return total
def class_probabi(labels):
    """Return the list of class probabilities for a list of labels.

    BUG FIX: the original returned entropy(probabilities), so the caller
    (data_entropy) computed entropy of a float and crashed with a TypeError.
    This function's job is only to turn label counts into probabilities.
    """
    total_count = len(labels)
    return [count / total_count for count in Counter(labels).values()]
def data_entropy(labeled_data):
    """Entropy (in bits) of the class labels in a list of (features, label) pairs.

    BUG FIX: the original parameter was named `label` and shadowed the loop
    variable, and its call chain applied entropy() twice (crashing on a float).
    Computed self-contained here: count labels, convert to probabilities,
    apply the Shannon entropy formula.
    """
    labels = [lbl for _, lbl in labeled_data]
    total = len(labels)
    # probability of each class; counts are never zero so no log(0) risk
    probabilities = [count / total for count in Counter(labels).values()]
    return sum(-p * math.log(p, 2) for p in probabilities)
def partitionentropy(subsets):
    """Weighted average entropy across subsets, weighted by subset size.

    NOTE(review): a later definition with the same name replaces this one at
    module level; in the original file this version is effectively dead code.
    """
    total = sum(len(s) for s in subsets)
    return sum(len(s) / total * data_entropy(s) for s in subsets)
def group_by(items, key_fn):
    """Group items into a dict mapping key_fn(item) -> list of items.

    BUG FIX: the original did `key = key(item)` inside the loop, rebinding
    the key function to its own result, so the second iteration tried to
    call a non-callable and raised a TypeError.
    """
    groups = defaultdict(list)
    for item in items:
        groups[key_fn(item)].append(item)
    return groups
def partition(input_data, feature):
    """Split (features, label) pairs into groups by the value of one feature."""
    def feature_value(pair):
        return pair[0][feature]
    return group_by(input_data, feature_value)
def partitionentropy(input_data, feature):
    """Weighted entropy of partitioning input_data on the given feature.

    BUG FIX: the original called `partitionentropy(partitions.values())`,
    intending the earlier one-argument helper — but since this definition
    shadows that name, the call recursed into itself forever. The weighted
    entropy is computed inline here so this function is self-contained.
    """
    subsets = list(partition(input_data, feature).values())
    total_count = sum(len(subset) for subset in subsets)
    # average each subset's entropy, weighted by its share of the examples
    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)
def organize(tree, input):
    """Classify an input dict by walking the decision tree.

    A tree is either a bool leaf (the classification) or a tuple
    (feature, subtree_dict) where subtree_dict maps feature values to
    subtrees; the None key is the default branch for unseen values.

    BUG FIX: the original tested `tree in [Tru, Fal]` — `Tru` and `Fal`
    are undefined names (NameError); they should be True/False.
    """
    if isinstance(tree, bool):
        return tree  # reached a leaf
    feature, subtree_dict = tree
    subtree_key = input.get(feature)
    if subtree_key not in subtree_dict:
        # value never seen during training: fall back to the default branch
        subtree_key = None
    subtree = subtree_dict[subtree_key]
    return organize(subtree, input)
def buildTree(input_data, splt_cand=None):
    """Build an ID3 decision tree from a list of (features, label) pairs.

    Returns either a bool leaf or (feature, subtree_dict); subtree_dict[None]
    is the majority-class default branch used for unseen feature values.

    BUG FIXES vs. the original: the curly-quoted "comment" strings were
    syntax errors, `Tru`/`Fal` were undefined names, and `.iteritm()` is
    neither Python 2's `iteritems()` nor Python 3's `items()`.
    """
    if splt_cand is None:
        # first call: every feature of the first example is a candidate
        splt_cand = list(input_data[0][0].keys())
    # count True and False labels in the input data
    num_input_data = len(input_data)
    num_true = sum(1 for _, label in input_data if label)
    num_false = num_input_data - num_true
    if num_true == 0:       # all examples are False -> False leaf
        return False
    if num_false == 0:      # all examples are True -> True leaf
        return True
    if not splt_cand:       # no features left -> majority-class leaf
        return num_true >= num_false
    # split on the feature that minimizes the weighted partition entropy
    top_feature = min(splt_cand,
                      key=partial(partitionentropy, input_data))
    partitions = partition(input_data, top_feature)
    new_cand = [a for a in splt_cand if a != top_feature]
    # build a subtree for each observed value of the chosen feature
    subtrees = {value: buildTree(subset, new_cand)
                for value, subset in partitions.items()}
    subtrees[None] = num_true > num_false  # default branch for unseen values
    return (top_feature, subtrees)
def forest_organize(trees, input):
    """Classify input with every tree and return the majority-vote label."""
    votes = Counter(organize(tree, input) for tree in trees)
    return votes.most_common(1)[0][0]
if __name__ == "__main__":
input_data = [
({'level':'Senior','lang':'Java','tweets':'no','phd':'no'}, Fal ),
({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, Fal ),
({'level':'Mid','lang':'Python','tweets':'no','phd':'no'}, Tru ),
({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, Tru ),
({'level':'Junior','lang':'R','tweets':'yes','phd':'no'}, Tru ),
({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'}, Fal ),
({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'}, Tru ),
({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, Fal ),
({'level':'Senior','lang':'R','tweets':'yes','phd':'no'}, Tru ),
({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, Tru ),
({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},Tru ),
({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'}, Tru ),
({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'}, Tru ),
({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},Fal )
]
for key in ['level','lang','tweets','phd']:
print key, partitionentropy(input_data, key)
senior_input_data = [(input, label)
for input, label in input_data if input["level"] == "Senior"]
for key in ['lang', 'tweets', 'phd']:
print key, partitionentropy(senior_input_data, key)
print " the tree developing "
tree = buildTree(input_data)
print tree
print "Junior / Java / tweets / no phd", organize(tree,
{ "level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "no"} )
print "Junior / Java / tweets / phd", organize(tree,
{ "level" : "Junior",
"lang" : "Java",
"tweets" : "yes",
"phd" : "yes"} )
print "Intern", organize(tree, { "level" : "Intern" } )
print "Senior", organize(tree, { "level" : "Senior" } )
Related Questions
drjack9650@gmail.com
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.