Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

You are provided with a dataset that includes data of people who are interviewed

ID: 3687208 • Letter: Y

Question

You are provided with a dataset describing people who were interviewed for a Data Scientist job. The data is shown below; the fields are each candidate’s level, their preferred language, whether they are active on Twitter, and whether they have a PhD. The class label (the last attribute) is either Yes (the candidate interviewed well) or No (the candidate interviewed poorly).

inputs = [

        ({'level':'Senior','lang':'Java','tweets':'no','phd':'no'},   False),

        ({'level':'Senior','lang':'Java','tweets':'no','phd':'yes'}, False),

        ({'level':'Mid','lang':'Python','tweets':'no','phd':'no'},     True),

        ({'level':'Junior','lang':'Python','tweets':'no','phd':'no'}, True),

        ({'level':'Junior','lang':'R','tweets':'yes','phd':'no'},      True),

        ({'level':'Junior','lang':'R','tweets':'yes','phd':'yes'},    False),

        ({'level':'Mid','lang':'R','tweets':'yes','phd':'yes'},        True),

        ({'level':'Senior','lang':'Python','tweets':'no','phd':'no'}, False),

        ({'level':'Senior','lang':'R','tweets':'yes','phd':'no'},      True),

        ({'level':'Junior','lang':'Python','tweets':'yes','phd':'no'}, True),

        ({'level':'Senior','lang':'Python','tweets':'yes','phd':'yes'},True),

        ({'level':'Mid','lang':'Python','tweets':'no','phd':'yes'},    True),

        ({'level':'Mid','lang':'Java','tweets':'yes','phd':'no'},      True),

        ({'level':'Junior','lang':'Python','tweets':'no','phd':'yes'},False)

    ]

Using this data, you are asked to build a model that identifies which candidates will interview well (so that your boss does not have to waste his time interviewing candidates!).

HINTS:

-       Use decision trees, and ID3 decision tree

-       Use entropy to decide whether or not to split a node, and on which attribute to split (unlike in class, where we mostly used GINI)

Write Python code for the above example.

Explanation / Answer

Given below is the code, with comments to make it easy to understand.

import math
import random
from collections import Counter, defaultdict
from functools import partial

def entropy(class_probabi):
    """Return the Shannon entropy, in bits, of a list of class probabilities.

    Args:
        class_probabi: iterable of probabilities, one per class in a node.

    Returns:
        The sum of ``-p * log2(p)`` over the non-zero probabilities; 0 for an
        empty iterable.
    """
    # Zero probabilities are skipped: log(0) is undefined and such classes
    # contribute nothing to the entropy.
    return sum(-p * math.log(p, 2) for p in class_probabi if p)

def class_probabi(labels):
    """Return the relative frequency of each distinct label.

    Args:
        labels: sized collection of hashable class labels.

    Returns:
        A list with one probability per distinct label; empty when ``labels``
        is empty.
    """
    total_count = len(labels)
    # Return the probabilities themselves (not their entropy): the caller
    # passes this list on to entropy().
    return [count / total_count for count in Counter(labels).values()]

   

def data_entropy(label):
    """Return the entropy of the labels in a collection of labeled rows.

    Args:
        label: iterable of two-element ``(features, outcome)`` pairs; only the
            second element of each pair is used.

    Returns:
        The result of passing the outcomes through ``class_probabi`` and then
        ``entropy``.
    """
    outcomes = [outcome for _features, outcome in label]
    return entropy(class_probabi(outcomes))

def partitionentropy(subsets):
    """Return the size-weighted average entropy of a collection of subsets.

    Each subset's ``data_entropy`` is weighted by its share of the total
    number of rows. ``subsets`` is iterated twice.

    NOTE(review): this file later defines another function with this same
    name and a two-argument signature, which replaces this one at import
    time.
    """
    total_count = 0
    for subset in subsets:
        total_count += len(subset)

    weighted = 0
    for subset in subsets:
        weighted += data_entropy(subset) * len(subset) / total_count
    return weighted

def group_by(itm, key):
    """Group items by the value a key function returns for them.

    Args:
        itm: iterable of items to group.
        key: callable mapping an item to a hashable group key.

    Returns:
        A ``defaultdict(list)`` mapping each group key to the list of items
        that produced it, in input order.
    """
    groups = defaultdict(list)
    for item in itm:
        # Do not rebind `key` here: it must remain the callable for the next
        # iteration.
        groups[key(item)].append(item)
    return groups

   

def partition(input_data, feature):
    """Group labeled rows by the value each row has for ``feature``.

    Each row is indexed as ``row[0][feature]``, i.e. its first element is a
    mapping of feature names to values. Grouping is delegated to ``group_by``.
    """
    def feature_value(row):
        return row[0][feature]

    return group_by(input_data, feature_value)

def partitionentropy(input_data, feature):
    """Return the entropy that remains after splitting the data on a feature.

    Args:
        input_data: collection of ``(features, label)`` pairs.
        feature: name of the feature to partition on.

    Returns:
        The average ``data_entropy`` of the partitions, each weighted by its
        share of the rows; 0 when there are no rows.
    """
    subsets = list(partition(input_data, feature).values())
    total_count = sum(len(subset) for subset in subsets)
    # The weighted average is computed here rather than by calling a
    # one-argument helper of the same name, which would resolve to this
    # function and fail.
    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)

def organize(tree, input):
    """Classify one input with a decision tree.

    Args:
        tree: either a leaf (``True`` / ``False``) or a
            ``(feature, subtree_dict)`` pair, where ``subtree_dict`` maps
            feature values (plus ``None`` as a fallback) to subtrees.
        input: mapping of feature names to values for the candidate.

    Returns:
        The boolean stored at the leaf that the input reaches.
    """
    # Walk down the tree until a boolean leaf is reached.
    while tree not in (True, False):
        feature, subtree_dict = tree
        subtree_key = input.get(feature)
        # A missing feature or a value not seen when the tree was built
        # falls through to the None branch.
        if subtree_key not in subtree_dict:
            subtree_key = None
        tree = subtree_dict[subtree_key]
    return tree

def buildTree(input_data, splt_cand=None):
    """Build an ID3 decision tree from labeled rows.

    Args:
        input_data: sequence of ``(features, label)`` pairs, where
            ``features`` is a dict and ``label`` is truthy or falsy.
        splt_cand: feature names still available for splitting; defaults to
            the keys of the first row's feature dict.

    Returns:
        ``True`` or ``False`` for a leaf, otherwise a
        ``(feature, subtrees)`` pair where ``subtrees`` maps each observed
        value of ``feature`` to a subtree and ``None`` to the majority label.
    """
    # Count Trues and Falses in the input data.
    num_input_data = len(input_data)
    num_true = sum(1 for _, label in input_data if label)
    num_false = num_input_data - num_true

    if num_true == 0:
        return False    # no Trues (also covers empty input): False leaf
    if num_false == 0:
        return True     # no Falses: True leaf

    # Resolved only after the leaf checks so that empty input never indexes
    # into input_data.
    if splt_cand is None:
        splt_cand = input_data[0][0].keys()

    if not splt_cand:
        # Nothing left to split on: return the majority label (ties -> True).
        return num_true >= num_false

    # Split on the feature that leaves the lowest partition entropy.
    top_feature = min(splt_cand,
                      key=partial(partitionentropy, input_data))
    partitions = partition(input_data, top_feature)
    new_cand = [a for a in splt_cand if a != top_feature]

    # Recursively build one subtree per observed value of the feature.
    subtrees = {value: buildTree(subset, new_cand)
                for value, subset in partitions.items()}

    # Fallback branch for values not seen at this node.
    subtrees[None] = num_true > num_false

    return (top_feature, subtrees)

def forest_organize(trees, input):
    """Classify an input by majority vote over several decision trees.

    Every tree in ``trees`` classifies ``input`` through ``organize``; the
    most common result is returned.
    """
    votes = Counter(organize(tree, input) for tree in trees)
    winner_and_count = votes.most_common(1)[0]
    return winner_and_count[0]

if __name__ == "__main__":
    # Demo: report entropies, build the tree, and classify a few candidates.

    # Interview data: (candidate features, interviewed well?)
    input_data = [
        ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False),
        ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False),
        ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, False),
        ({'level': 'Mid', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, True),
        ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False),
        ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True),
        ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, True),
        ({'level': 'Mid', 'lang': 'Java', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, False),
    ]

    # Entropy left after splitting the full data set on each feature.
    for key in ['level', 'lang', 'tweets', 'phd']:
        print(key, partitionentropy(input_data, key))
    print()

    # The same comparison restricted to the Senior candidates.
    senior_input_data = [(features, label)
                         for features, label in input_data
                         if features["level"] == "Senior"]
    for key in ['lang', 'tweets', 'phd']:
        print(key, partitionentropy(senior_input_data, key))
    print()

    print(" the tree developing ")
    tree = buildTree(input_data)
    print(tree)

    # Classify candidates, including ones with unseen or missing values.
    print("Junior / Java / tweets / no phd", organize(tree, {
        "level": "Junior",
        "lang": "Java",
        "tweets": "yes",
        "phd": "no",
    }))
    print("Junior / Java / tweets / phd", organize(tree, {
        "level": "Junior",
        "lang": "Java",
        "tweets": "yes",
        "phd": "yes",
    }))
    print("Intern", organize(tree, {"level": "Intern"}))
    print("Senior", organize(tree, {"level": "Senior"}))

Hire Me For All Your Tutoring Needs
Integrity-first tutoring: clear explanations, guidance, and feedback.
Drop an Email at
drjack9650@gmail.com
Chat Now And Get Quote