

Question

You are provided with a dataset containing data on people who were interviewed for a Data Scientist job. The data is below, and the fields are: each candidate's level, their preferred language, whether they are active on Twitter, and whether they have a PhD. The class label (the last attribute) is either Yes (the candidate interviewed well) or No (the candidate interviewed poorly).

inputs = [
    ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False),
    ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False),
    ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, False),
    ({'level': 'Mid', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False),
    ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True),
    ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, True),
    ({'level': 'Mid', 'lang': 'Java', 'tweets': 'yes', 'phd': 'no'}, True),
    ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, False)
]

Use Python for the development.

Using this data, you are asked to build a model that identifies which candidates will interview well (so that your boss does not have to waste time interviewing candidates!).

You are not asked to create the tree from the data manually. You are supposed to write a program (in Python) that builds it, and the program should be able to accept other similar data and output the tree.

HINTS:
- Use decision trees, specifically the ID3 decision tree algorithm.
- Use entropy to decide whether or not to split a node and on which attribute to split (do not use Gini); a small worked example of the entropy calculation follows below.
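
As a quick sanity check, here is a minimal sketch (using only the 9 True / 5 False label split visible in the data above) of the entropy formula H = -Σ p * log2(p) applied to the full training set:

import math

# entropy of the full training set: 9 of the 14 labels are True, 5 are False
p_true, p_false = 9 / 14, 5 / 14
overall_entropy = -(p_true * math.log2(p_true) + p_false * math.log2(p_false))
print("%.3f" % overall_entropy)  # ~0.940 bits

A useful split is one whose weighted average entropy across the resulting subsets is lower than this starting value.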

Explanation / Answer

from collections import Counter, defaultdict
from functools import partial
import math


def entropy(class_probabilities):
    """given a list of class probabilities, compute entropy"""
    return sum(-p * math.log(p, 2)
               for p in class_probabilities
               if p) # ignore zero probabilities
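
# For example, a 50/50 class split has entropy 1.0 bit, while a node whose labels
# all belong to one class has entropy 0.0, so lower entropy means less label mixing.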


def class_probabilities(labels):
    total_count = len(labels)
    return [count / total_count
            for count in Counter(labels).values()]


def data_entropy(labeled_data):
    labels = [label for _, label in labeled_data]
    probabilities = class_probabilities(labels)
    return entropy(probabilities)
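
# On the full training set below (9 True vs. 5 False labels), data_entropy(inputs)
# works out to roughly 0.94 bits.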


def partition_entropy(subsets):
    """find the entropy from this partition of data into subsets.
        subsets is a list of lists of labeled data"""

    total_count = sum(len(subset) for subset in subsets)

    return sum(data_entropy(subset) * len(subset) / total_count
               for subset in subsets)


def partition_by(inputs, attribute):
    """each input is a parit (attribute_dict, labels).
        returns a dict : attribute_values -> inputs"""
    groups = defaultdict(list)
    for input in inputs:
        key = input[0][attribute]
        groups[key].append(input)
    return groups


def partition_entropy_by(inputs, attribute):
    """computes the entropy corresponding to the given partition"""
    partitions = partition_by(inputs, attribute)
    return partition_entropy(partitions.values())
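
# On the training data below this gives roughly: level 0.69, lang 0.86,
# tweets 0.79, phd 0.89; so 'level' is the lowest-entropy attribute and
# becomes the first split.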


def classify(tree, input):
    """classify the input using the given decision tree"""

    # if this is a leaf node, return its value
    if tree in [True, False]:
        return tree

    # otherwise this tree consists of an attribute to split on
    # and a dictionary whose keys are values of that attribute
    # and whose values are the subtrees to consider next
    attribute, subtree_dict = tree

    subtree_key = input.get(attribute) # None if input is missing the attribute

    if subtree_key not in subtree_dict: # if no subtree for key
        subtree_key = None # use the default None subtree

    subtree = subtree_dict[subtree_key] # selects appropriate subtree
    return classify(subtree, input)


def build_tree_id3(inputs, split_candidates=None):
    # if this is our first pass,
    # all the keys of the first input are split candidates
    if split_candidates is None:
        split_candidates = list(inputs[0][0].keys())

    # count Trues and Falses in the inputs
    num_inputs = len(inputs)
    num_trues = len([label for item, label in inputs if label])
    num_falses = num_inputs - num_trues

    if num_trues == 0: return False # no Trues? return a "False" leaf
    if num_falses == 0: return True # no Falses? return a "True" leaf

    if not split_candidates: # if no split candidates left
        return num_trues >= num_falses # return the majority leaf

    # otherwise, split on the best attribute
    best_attribute = min(split_candidates,
                         key=partial(partition_entropy_by, inputs))

    partitions = partition_by(inputs, best_attribute)
    new_candidates = [a for a in split_candidates
                      if a != best_attribute]

    # recursively build subtree
    subtrees = {attribute_value: build_tree_id3(subset, new_candidates)
                for attribute_value, subset in partitions.items()}

    subtrees[None] = num_trues > num_falses # default case

    return best_attribute, subtrees

if __name__ == "__main__":

    inputs = [
        ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'no'}, False),
        ({'level': 'Senior', 'lang': 'Java', 'tweets': 'no', 'phd': 'yes'}, False),
        ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, False),
        ({'level': 'Mid', 'lang': 'R', 'tweets': 'yes', 'phd': 'yes'}, True),
        ({'level': 'Senior', 'lang': 'Python', 'tweets': 'no', 'phd': 'no'}, False),
        ({'level': 'Senior', 'lang': 'R', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Senior', 'lang': 'Python', 'tweets': 'yes', 'phd': 'yes'}, True),
        ({'level': 'Mid', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, True),
        ({'level': 'Mid', 'lang': 'Java', 'tweets': 'yes', 'phd': 'no'}, True),
        ({'level': 'Junior', 'lang': 'Python', 'tweets': 'no', 'phd': 'yes'}, False)
    ]

    for key in ['level', 'lang', 'tweets', 'phd']:
        print(key, "%.2f" % partition_entropy_by(inputs, key))

    print("Building tree")
    tree = build_tree_id3(inputs)
    print(tree)

    print("Junior / Java / tweets / no phd", classify(tree, {"level": "Junior",
                                                             "lang": "Java",
                                                             "tweets": "yes",
                                                             "phd": "no"}))  # True

    print("Junior / Java / tweets / phd", classify(tree, {"level": "Junior",
                                                          "lang": "Java",
                                                          "tweets": "yes",
                                                          "phd": "yes"}))  # False

    print("Intern", classify(tree, {"level": "Intern"}))  # True
    print("Senior", classify(tree, {"level": "Senior"}))  # False
