Question
Use the Python programming language to write a program that computes the sentiment of each of the 100 reviews. You will find a zipped file containing 100 reviews from Yelp.
There are several ways to do this:
1. Use the AFFIN-111.txt file on Blackboard to get words and their associated sentiment scores. For each Yelp review, iterate through its words and compute the cumulative sentiment score.
2. You may use TextBlob to compute the sentiment score for each Yelp review.
3. NLTK provides a SentimentIntensityAnalyzer (see my slides for an example) to compute sentiments; a minimal sketch follows this list.
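For approach 3, here is a minimal sketch, assuming NLTK is installed and the VADER lexicon has been downloaded:

import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # one-time download of the VADER lexicon

sia = SentimentIntensityAnalyzer()
scores = sia.polarity_scores("The staff was friendly and the food was great!")
print(scores['compound'])  # compound score in [-1, 1]; neg/neu/pos scores are also returned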
Use any two of the approaches shown above to compute the sentiment of each of the 100 Yelp reviews. A sample output using AFFIN-111.txt and TextBlob would look as follows (a sketch that produces this table appears right after it):
Review    AFFIN-Sentiment    TextBlob-Sentiment
1         8                  0.7
2         -3                 -0.16
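A minimal sketch combining approaches 1 and 2 that prints such a table. It assumes AFFIN-111.txt is tab-separated (term, then integer score, as in the standard AFINN lexicon) and that the unzipped reviews are stored one per line in a file named yelp_reviews.txt; both the reviews file name and its layout are assumptions, so adjust them to the actual archive:

from textblob import TextBlob

# Build a term -> score lookup from the lexicon (assumed format: term<TAB>score).
afinn = {}
with open('AFFIN-111.txt') as lexicon:
    for line in lexicon:
        term, score = line.rsplit('\t', 1)  # rsplit keeps multi-word terms intact
        afinn[term] = int(score)

def afinn_score(review):
    # Cumulative sentiment: sum the lexicon score of every known word.
    return sum(afinn.get(word, 0) for word in review.lower().split())

# Assumption: 'yelp_reviews.txt' holds the 100 reviews, one per line.
with open('yelp_reviews.txt') as reviews_file:
    reviews = [line.strip() for line in reviews_file if line.strip()]

print('Review AFFIN-Sentiment TextBlob-Sentiment')
for i, review in enumerate(reviews, start=1):
    polarity = TextBlob(review).sentiment.polarity  # float in [-1, 1]
    print(i, afinn_score(review), round(polarity, 2))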
Link to the 100 Yelp reviews: https://elearn.uta.edu/bbcswebdav/pid-7188776-dt-content-rid-131820819_2/xid-131820819_2
Link to the AFFIN-111.txt file: https://drive.google.com/open?id=18RW1L_Tm6fHGlMFK4KlEVWoKdC7AhH2d
Explanation / Answer
The program below takes a dictionary-based approach: each review is split into sentences, every token is POS-tagged, tokens found in positive/negative/intensifier (inc)/decrementer (dec)/inverter (inv) YAML dictionaries are tagged with those labels, and the review score is the sum of per-token values (+1 positive, -1 negative), doubled after an intensifier, halved after a decrementer, and negated after an inverter.

from pprint import pprint
import nltk
import yaml


class Splitter(object):
    def __init__(self):
        # requires the punkt sentence tokenizer: nltk.download('punkt')
        self.nltk_splitter = nltk.data.load('tokenizers/punkt/english.pickle')
        self.nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()

    def split(self, text):
        """
        input format: a paragraph of text
        output format: a list of lists of words.
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        """
        sentences = self.nltk_splitter.tokenize(text)
        tokenized_sentences = [self.nltk_tokenizer.tokenize(sent) for sent in sentences]
        return tokenized_sentences


class POSTagger(object):
    def __init__(self):
        pass

    def pos_tag(self, sentences):
        """
        input format: list of lists of words
        e.g.: [['this', 'is', 'a', 'sentence'], ['this', 'is', 'another', 'one']]
        output format: list of lists of tagged tokens. Each tagged token has a
        form, a lemma, and a list of tags
        e.g.: [[('this', 'this', ['DT']), ('is', 'be', ['VB']),
                ('a', 'a', ['DT']), ('sentence', 'sentence', ['NN'])],
               [('this', 'this', ['DT']), ('is', 'be', ['VB']),
                ('another', 'another', ['DT']), ('one', 'one', ['CARD'])]]
        """
        # requires the tagger model: nltk.download('averaged_perceptron_tagger')
        pos = [nltk.pos_tag(sentence) for sentence in sentences]
        # adapt format: (form, lemma, [tags]); the form doubles as the lemma here
        pos = [[(word, word, [postag]) for (word, postag) in sentence] for sentence in pos]
        return pos


class DictionaryTagger(object):
    def __init__(self, dictionary_paths):
        files = [open(path, 'r') for path in dictionary_paths]
        # safe_load instead of load: yaml.load without a Loader is deprecated/unsafe
        dictionaries = [yaml.safe_load(dict_file) for dict_file in files]
        # close the files explicitly; map() is lazy in Python 3 and would never run
        for dict_file in files:
            dict_file.close()
        self.dictionary = {}
        self.max_key_size = 0
        for curr_dict in dictionaries:
            for key in curr_dict:
                if key in self.dictionary:
                    self.dictionary[key].extend(curr_dict[key])
                else:
                    self.dictionary[key] = curr_dict[key]
                    self.max_key_size = max(self.max_key_size, len(key))

    def tag(self, postagged_sentences):
        return [self.tag_sentence(sentence) for sentence in postagged_sentences]

    def tag_sentence(self, sentence, tag_with_lemmas=False):
        """
        the result is only one tagging of all the possible ones.
        The resulting tagging is determined by these two priority rules:
        - longest matches have higher priority
        - search is made from left to right
        """
        tag_sentence = []
        N = len(sentence)
        if self.max_key_size == 0:
            self.max_key_size = N
        i = 0
        while i < N:
            j = min(i + self.max_key_size, N)  # avoid overflow
            tagged = False
            while j > i:
                expression_form = ' '.join([word[0] for word in sentence[i:j]]).lower()
                expression_lemma = ' '.join([word[1] for word in sentence[i:j]]).lower()
                if tag_with_lemmas:
                    literal = expression_lemma
                else:
                    literal = expression_form
                if literal in self.dictionary:
                    is_single_token = j - i == 1
                    original_position = i
                    i = j
                    taggings = [tag for tag in self.dictionary[literal]]
                    tagged_expression = (expression_form, expression_lemma, taggings)
                    if is_single_token:
                        # if the tagged literal is a single token, conserve its previous taggings:
                        original_token_tagging = sentence[original_position][2]
                        tagged_expression[2].extend(original_token_tagging)
                    tag_sentence.append(tagged_expression)
                    tagged = True
                else:
                    j = j - 1
            if not tagged:
                tag_sentence.append(sentence[i])
                i += 1
        return tag_sentence


def value_of(sentiment):
    if sentiment == 'positive':
        return 1
    if sentiment == 'negative':
        return -1
    return 0


def sentence_score(sentence_tokens, previous_token, acum_score):
    if not sentence_tokens:
        return acum_score
    current_token = sentence_tokens[0]
    tags = current_token[2]
    token_score = sum([value_of(tag) for tag in tags])
    if previous_token is not None:
        previous_tags = previous_token[2]
        if 'inc' in previous_tags:
            token_score *= 2.0
        elif 'dec' in previous_tags:
            token_score /= 2.0
        elif 'inv' in previous_tags:
            token_score *= -1.0
    return sentence_score(sentence_tokens[1:], current_token, acum_score + token_score)


def sentiment_score(review):
    return sum([sentence_score(sentence, None, 0.0) for sentence in review])


if __name__ == "__main__":
    text = """What can I say about this place. The staff of the restaurant is
    nice and the eggplant is not bad. Apart from that, very uninspired food,
    lack of atmosphere and too expensive. I am a staunch vegetarian and was
    sorely dissapointed with the veggie options on the menu. Will be the last
    time I visit, I recommend others to avoid."""

    splitter = Splitter()
    postagger = POSTagger()
    dicttagger = DictionaryTagger(['dicts/positive.yml', 'dicts/negative.yml',
                                   'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])

    splitted_sentences = splitter.split(text)
    pprint(splitted_sentences)

    pos_tagged_sentences = postagger.pos_tag(splitted_sentences)
    pprint(pos_tagged_sentences)

    dict_tagged_sentences = dicttagger.tag(pos_tagged_sentences)
    pprint(dict_tagged_sentences)

    print("analyzing sentiment...")
    score = sentiment_score(dict_tagged_sentences)
    print(score)
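To score all 100 Yelp reviews with this pipeline instead of the hard-coded sample text, a hypothetical driver could replace the __main__ block. It reuses the classes defined above and again assumes the unzipped reviews are stored one per line in a file named yelp_reviews.txt (both the file name and the layout are assumptions):

splitter = Splitter()
postagger = POSTagger()
dicttagger = DictionaryTagger(['dicts/positive.yml', 'dicts/negative.yml',
                               'dicts/inc.yml', 'dicts/dec.yml', 'dicts/inv.yml'])

# Assumption: 'yelp_reviews.txt' holds the 100 reviews, one per line.
with open('yelp_reviews.txt') as reviews_file:
    for i, review in enumerate(reviews_file, start=1):
        tagged = dicttagger.tag(postagger.pos_tag(splitter.split(review)))
        print(i, sentiment_score(tagged))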