Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

**The bigram and check method are completed, but having issues with the generate

ID: 3847727 • Letter: #

Question

**The bigram and check method are completed, but having issues with the generate method.

The text files couldnt be provided due to the inappropriated words.

------------------------------ Bigram.java -------------------------------

import java.util.ArrayList;

import java.util.HashSet;

import java.util.TreeMap;

import java.util.Scanner;

import java.util.Set;

public class Bigram {

public Set set = new HashSet();

public TreeMap> gMap = new TreeMap<>();

public Bigram(String s) {

....

}

public boolean check(String s) {

....

}

public void createMapBigram(String s){

Scanner sc = new Scanner(s);

String previous = sc.next();

while(sc.hasNext()){

String next = sc.next();

if(gMap.containsKey(previous)){

ArrayList arr = new ArrayList();

arr = gMap.get(previous);

arr.add(next);

gMap.remove(previous);

gMap.put(previous, arr);

}else{

ArrayList arr = new ArrayList();

arr.add(next);

gMap.put(previous, arr);

}

previous = next;

}

if(gMap.containsKey(previous)){

ArrayList arr = new ArrayList();

arr = gMap.get(previous);

arr.add("");

gMap.remove(previous);

gMap.put(previous, arr);

}else{

ArrayList arr = new ArrayList();

arr.add("");

gMap.put(previous, arr);

}

}

public String[] generate(String start, int count) {

}

}

------------------------------ BigramTest.java -------------------------------

import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Arrays;

public class BigramTest {

   public static int test(String file, byte[] xmd5, String[] gen, String[] desired, String[] check, boolean[] truth)
           throws NoSuchAlgorithmException {
       System.out.println("Loading " + file + "...");
       String text;
       try {
           text = new String(Files.readAllBytes(Paths.get(file)));
       } catch (Exception e) {
           System.out.println("Couldn't find '" + file
                   + "'. Please place this file in the root directory of this project (next to JRE System Library, not indented).");
           return 0;
       }
       MessageDigest md5 = MessageDigest.getInstance("MD5");
       byte[] digest = md5.digest(text.replaceAll("\s+", " ").getBytes());
       //System.out.println(Arrays.toString(digest));
       if (!Arrays.equals(digest, xmd5)) {
           System.out.println("Your copy of " + file + " appears to contain errors! Please download it again.");
           return 0;
       }
       System.out.println("Loaded " + file + ". Initializing Bigram object...");
       long start = System.currentTimeMillis();
       Bigram u = new Bigram(text);
       System.out.println("Generating.");
       int genScore = 0;
       for (int i = 0; i < gen.length; i++) {
           String[] foo = u.generate(gen[i], 10);
           if (foo == null) {
               System.out.println("For start word " + gen[i] + " with 10 words, you returned a null array!");
               continue;
           }
           String gened = "";
           for (int j = 0; j < foo.length; j++) {
               gened = gened + foo[j] + (j < foo.length - 1 ? " " : "");
           }
           if (gened.equals(desired[i])) {
               genScore += 10;
           } else {
               System.out.println("For start word " + gen[i] + " with 10 words, expected '" + desired[i] + "' got '"
                       + gened + "'.");
           }
       }
       System.out.println("Checking.");
       int checkScore = 0;
       for (int i = 0; i < check.length; i++) {
           boolean ck = u.check(check[i]);
           if (ck == truth[i]) {
               checkScore += 10;
           } else {
               System.out
                       .println("For phrase '" + check[i] + "' expected return value " + truth[i] + " but got " + ck);
           }
       }
       long end = System.currentTimeMillis();
       // Attempt at a benchmark...
       Arrays.sort(text.toLowerCase().toUpperCase().split("\s"));
       Arrays.sort(text.toUpperCase().toLowerCase().toCharArray());
       Arrays.sort(text.split("\s"));
       Arrays.sort(text.getBytes());
       long sortime = System.currentTimeMillis();
       //System.out.println((double)(end-start-5)/(sortime - end));
       if ((double)(end - start - 5)/(sortime - end) > 8) {
           System.out.println("Your program is taking a while! Try speeding it up for extra credit.");
       } else if ((double)(end - start - 5)/(sortime - end) > 2) {
           System.out.println("Fast, but could be faster! Takes "+(end-start)+" ms, try to get it below ~"+(2*(sortime - end)+5));
           genScore += 1;
       } else {
           System.out.println("Super fast! Took "+(end - start)+" ms");
           genScore += 1;
           checkScore += 1;
       }
       return genScore * 100 + checkScore;
   }

   public static void main(String[] args) throws NoSuchAlgorithmException {
       final byte[] dmd5 = { -61, 106, 118, -21, 62, -73, 33, 75, 68, -48, 38, 39, 108, 27, 95, -44 };
       final byte[] gmd5 = { -59, 120, 53, -92, 81, 59, -34, 72, 56, 2, 112, -125, 127, 50, -42, 55 };
       int checkScore = 0, genScore = 0;
       try {
           System.out.println("Trying 'Bob' example from homework.");
           Bigram x = new Bigram("Bob likes dogs. Bill likes cats. Jane hates dogs.");
           if (x.check("Bob likes cats.")) {
               checkScore += 10;
           } else {
               System.out.println("First check failed.");
           }
           if (!x.check("Jane likes cats.")) {
               checkScore += 10;
           } else {
               System.out.println("Second check failed.");
           }
           System.out.println("Trying 'Balloon' example from homework.");
           Bigram y = new Bigram("The balloon was red. The balloon got bigger and bigger. The balloon popped.");
           String[] g1 = y.generate("The", 3);
           if (Arrays.equals(g1, new String[] { "The", "balloon", "got" })) {
               genScore += 10;
           } else {
               System.out.println("First generate failed. Got " + Arrays.toString(g1));
           }
           String[] g2 = y.generate("popped.", 2);
           if (Arrays.equals(g2, new String[] { "popped." })) {
               genScore += 10;
           } else {
               System.out.println("Second generate failed. Got " + Arrays.toString(g2));
           }

           System.out.println("Testing with the Declaration of Independence...");
           int dscores = test("decl.txt", dmd5, new String[] { "When" },
                   new String[] { "When in the most barbarous ages, and to the most" },
                   new String[] { "We have Petitioned for the rectitude of this Declaration,",
                           "instrument for pretended offences For abolishing" },
                   new boolean[] { true, true });
           genScore += dscores / 100;
           checkScore += dscores % 100;

           System.out.println("Testing with Great Expectations...");
           int gscores = test("gexp.txt", gmd5, new String[] { "Pip", "dozen" },
                   new String[] { "Pip and I had been a little while, and I",
                           "dozen yards of the same time to be a little" },
                   new String[] { "low leaden hue" }, new boolean[] { false });
           genScore += gscores / 100;
           checkScore += gscores % 100;

       } finally {
           System.out.println("Check: " + checkScore + " / 50");
           System.out.println("Generate: " + genScore + " / 50");
           System.out.println("Tentative total: " + (checkScore + genScore + " / 100"));
           System.out.println("Violations of the academic honesty policy may affect this score.");
       }
   }

}

------------------------------ Hint -------------------------------

Your phrase generation method will be given a start word and a count indicating the number of total words to generate (including the start word). It will generate the "most likely" or "most common" phrase based on bigram counts. It will return an array of Strings with the words generated in order. It always starts by generating the start word. As you generate each word, the next word generated should be the one that appears most often in the input (constructor) text after the previous word generated. If you reach a dead end (either the previous word was never seen or there are no words ever seen after that word), end generation early and return a shorter array. If there is more than one "most common" choice seen in the input text, pick the one with the smallest word according to the String compareTo method (NOTE: OrderedSets and OrderedMaps such as TreeSets and TreeMaps order their set (or set of keys) according to compareTo.) Example: Bigram y new Bigram("The balloon was red. The balloon got bigger and bigger. The balloon popped."); y.generate("The", 3) returns the String array ["The", "balloon", "got" y generate("popped 2) returns "popped. A tester program will be released which will test multiple larger examples. Your code should be able to work with input text containing up to a million words.

Explanation / Answer

The program to generate the bigram counter involves with training the data set with the sample texts and then based on the bigram frequencies generated, the list is generated, The program for bigram count is as folows,

  

import java.util.regex.Pattern; import java.util.regex.Matcher; import java.util.HashMap; import java.util.Iterator; import java.util.Set; import java.util.HashSet; import java.util.Stack; public class Bigram { public Set<String> samples; public HashMap<String, HashMap<String, Double>> counts; public HashMap<String, Double> unigramCounts; public final String START = ":S"; // For add-one smoothing public Set<String> wordSet; // Used to find the vocabulary public double vocabSize; // Size of the vocabulary // For Good Turing Smoothing public double numTrainingBigrams; // The size of the training set (# non-distinct words) public HashMap<Double, Double> numberOfBigramsWithCount; // The number of bigrams that occur x times public boolean goodTuringCountsAvailable = false; // True when good turing counts are available public static void main(String[] args) { if (args.length != 2) { System.out.println("You must supply 2 arguments: (1) Training file " + "(2) Test file"); System.exit(1); } NgramParser p = new NgramParser(args[0], true); HashSet<String> set = p.parse(); Bigram b = new Bigram(set); b.train(); System.out.println("Done training."); //System.out.println(b.getSentence()); NgramParser test = new NgramParser(args[1], true); HashSet<String> testset = test.parse(); System.out.println("Perplexity of the test set: " + b.perplexity(testset)); } public Bigram(Set<String> samples) { this.samples = samples; this.counts = new HashMap<String, HashMap<String, Double>>(); this.unigramCounts = new HashMap<String, Double>(); this.wordSet = new HashSet<String>(); this.numberOfBigramsWithCount = new HashMap<Double, Double>(); } public void train() { // Regexp to match words (starting with optional apos) or any punctuation (with probably extra escaping) String regexp = "('?\w+|\p{Punct})"; Pattern pattern = Pattern.compile(regexp); for (String sample : samples) { Matcher matcher = pattern.matcher(sample); String previousWord = START; // originally set to beginning-of-sentence marker while (matcher.find()) { // Set unigram counts (for word1) double unigramCount = 0.0; if (unigramCounts.containsKey(previousWord)) { unigramCount = unigramCounts.get(previousWord); } unigramCounts.put(previousWord, unigramCount+1.0); // Get the new match (word2) String match = matcher.group(); wordSet.add(match); // Get access to (or create) the count map for word1. HashMap<String, Double> innerCounts; if (counts.containsKey(previousWord)) { innerCounts = counts.get(previousWord); } else { innerCounts = new HashMap<String, Double>(); counts.put(previousWord, innerCounts); } // Add to the size of the training set for gt-smoothing numTrainingBigrams += 1; // Set bigram counts double count = 0.0; if (innerCounts.containsKey(match)) { count = innerCounts.get(match); // Decrement the number of bigrams with old count for gt-smoothing numberOfBigramsWithCount.put(count, numberOfBigramsWithCount.get(count) - 1.0); } innerCounts.put(match, count+1.0); // Increment the number of bigrams with the new count for gt-smoothing if (!numberOfBigramsWithCount.containsKey(count+1.0)) { numberOfBigramsWithCount.put(count+1.0, 1.0); } else { numberOfBigramsWithCount.put(count+1.0, numberOfBigramsWithCount.get(count+1.0) + 1.0); } // Update previousWord previousWord = match; } } vocabSize = wordSet.size(); } public double count(String word1, String word2) { if (counts.containsKey(word1) && counts.get(word1).containsKey(word2)) { return counts.get(word1).get(word2); } return 0.0; } public double unigramCount(String word) { if (unigramCounts.containsKey(word)) { return unigramCounts.get(word); } return 0.0; } public double unsmoothedProbability(String word1, String word2) { if (counts.containsKey(word1)) { if (counts.get(word1).containsKey(word2)) { return counts.get(word1).get(word2) / unigramCounts.get(word1); } else { return 0.0; } } else { return 0.0; } } public double addOneSmoothedProbability(String word1, String word2) { // (count(word1 word2) + 1) / (count(word1) + V) return (count(word1, word2) + 1.0) / (unigramCount(word1) + vocabSize); } public double goodTuringSmoothedProbability(String word1, String word2) { if (!goodTuringCountsAvailable) { System.out.println("Making good turing counts..."); makeGoodTuringCounts(); System.out.println("Done making good turing counts."); } // If this bigram has occurred, return good turing probability double gtcount = count(word1, word2); if (gtcount > 0.0) { return gtcount / unigramCount(word1); } // Otherwise, return N1/N as per book (page 101?) return numberOfBigramsWithCount.get(1.0) / numTrainingBigrams; } public void makeGoodTuringCounts() { // Generate good turing counts for (String word1 : counts.keySet()) { HashMap<String, Double> innerMap = counts.get(word1); double unigramCount = 0; for (String word2 : innerMap.keySet()) { double count = innerMap.get(word2); if (!numberOfBigramsWithCount.containsKey(count+1)) { numberOfBigramsWithCount.put(count+1, 0.0); } // c* = (c+1) * N(c+1) / N(c) double newCount = (count + 1)*(numberOfBigramsWithCount.get(count+1.0))/(numberOfBigramsWithCount.get(count)); innerMap.put(word2, newCount); unigramCount += newCount; } unigramCounts.put(word1, unigramCount); } goodTuringCountsAvailable = true; } public void showCounts() { for (String word1 : counts.keySet()) { for (String word2 : counts.get(word1).keySet()) { System.out.println(word1 + " " + word2 + ": " + counts.get(word1).get(word2)); } } } public String getSentence() { String sentence = ""; String currentWord = START; String nextWord = START; //creates a sentence until a period is found //(400 is jic it doesn't find a period) while (!currentWord.equals(".") && sentence.length() <= 400) { Set<String> keySet = counts.get(currentWord).keySet(); // rand is like a random dart thrown onto a dart board // multiplied by totalCount for precision (since P(word) is small) double rand = Math.random() * unigramCounts.get(currentWord); Iterator<String> i = keySet.iterator(); //looking at all the words to see where the dart lands while (i.hasNext() && rand >= 0) { nextWord = i.next(); rand -= (double) counts.get(currentWord).get(nextWord); } currentWord = nextWord; sentence += nextWord + " "; } return sentence; } public double perplexity(Set<String> testSamples) { float product = 1; int wordCount = 0; Stack<Double> products = new Stack<Double>(); String regexp = "('?\w+|\p{Punct})"; Pattern pattern = Pattern.compile(regexp); // counting number of words in test set for (String sample : testSamples) { Matcher matcher = pattern.matcher(sample); String previousWord = START; while (matcher.find()) { String match = matcher.group(); products.push(goodTuringSmoothedProbability(previousWord, match)); wordCount++; // Update previousWord previousWord = match; } } // computing the necessary exponent double power = 1.0 / wordCount; // computing perplexity based on probabilities while (!products.empty()) { product *= Math.pow(products.pop(), power); } return 1 / product; } }