Academic Integrity: tutoring, explanations, and feedback — we don’t complete graded work or submit on a student’s behalf.

package index; import java.io.IOException; import java.io.Reader; import java.ut

ID: 3710518 • Letter: P

Question

package index;

import java.io.IOException;

import java.io.Reader;

import java.util.List;

import java.util.Set;

import comparators.TfIdfComparator;

import documents.DocumentId;

/**

* A simplified document indexer and search engine.

*

* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.

*

* Documents are internally represented as "terms", which are lowercased versions of each word

* in the document.

*

* Queries for terms are also made on the lowercased version of the term. Terms are

* therefore case-insensitive.

*

* Lookups for documents can be done by term, and the most relevant document(s) to a specific term

* (as computed by tf-idf) can also be retrieved.

*

* See:

* - <https://en.wikipedia.org/wiki/Inverted_index>

* - <https://en.wikipedia.org/wiki/Search_engine_(computing)>

* - <https://en.wikipedia.org/wiki/Tf%E2%80%93idf>

*

* @author Marc Liberatore

*

*/

public class SearchEngine {

  

  

   /**

   * Inserts a document into the search engine for later analysis and retrieval.

   *

   * The document is uniquely identified by a documentId; attempts to re-insert the same

   * document are ignored.

   *

   * The document is supplied as a Reader; this method stores the document contents for

   * later analysis and retrieval.

   *

   * @param documentId

   * @param reader

   * @throws IOException iff the reader throws an exception

   */

   public void addDocument(DocumentId documentId, Reader reader) throws IOException {

   }

  

   /**

   * Returns the set of DocumentIds contained within the search engine that contain a given term.

   *

   * @param term

   * @return the set of DocumentIds that contain a given term

   */

   public Set<DocumentId> indexLookup(String term) {

       return null;

   }

  

   /**

   * Returns the term frequency of a term in a particular document.

   *

   * The term frequency is number of times the term appears in a document.

   *

   * See

   * @param documentId

   * @param term

   * @return the term frequency of a term in a particular document

   * @throws IllegalArgumentException if the documentId has not been added to the engine

   */

   public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {

       return 0;

   }

  

   /**

   * Returns the inverse document frequency of a term across all documents in the index.

   *

   * For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where

   * N is the number of documents in total, and M

   * is the number of documents where the term appears.

   *

   * @param term

   * @return the inverse document frequency of term

   */

   public double inverseDocumentFrequency(String term) {

       // first calculate N, the number of documents plus one

       // loop through all of the documents to calculate M

       // finally, calculate and return log ((1 + N) / (1 + M))

       // use Math.log to compute the logarithm

       return 0;

   }

  

   /**

   * Returns the tfidf score of a particular term for a particular document.

   *

   * tfidf is the product of term frequency and inverse document frequency for the given term and document.

   *

   * @param documentId

   * @param term

   * @return the tfidf of the the term/document

   * @throws IllegalArgumentException if the documentId has not been added to the engine

   */

   public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {

       return 0.0;

   }

  

   /**

   * Returns a sorted list of documents, most relevant to least relevant, for the given term.

   *

   * A document with a larger tfidf score is more relevant than a document with a lower tfidf score.

   *

   * Each document in the returned list must contain the term.

   *

   * @param term

   * @param max the maximum number of documents to return (you may return fewer)

   * @return a list of documents sorted in descending order by tfidf

   */

   public List<DocumentId> relevanceLookup(String term, int max) {

       return null;

   }

}

Explanation / Answer

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;


import comparators.TfIdfComparator;
import documents.DocumentId;

/**
* A simplified document indexer and search engine.
*
* Documents are added to the engine one-by-one, and uniquely identified by a DocumentId.
*
* Documents are internally represented as "terms", which are lowercased versions of each word
* in the document.
*
* Queries for terms are also made on the lowercased version of the term. Terms are
* therefore case-insensitive.
*
* Lookups for documents can be done by term, and the most relevant document(s) to a specific term
* (as computed by tf-idf) can also be retrieved.
*
*
*/
public class SearchEngine {

    public Map<String, DocumentId> google = new HashMap<String, DocumentId>();
    /**
     * Inserts a document into the search engine for later analysis and retrieval.
     *
     * The document is uniquely identified by a documentId; attempts to re-insert the same
     * document are ignored.
     *
     * The document is supplied as a Reader; this method stores the document contents for
     * later analysis and retrieval.
     *
     * @param documentId
     * @param reader
     * @throws IOException iff the reader throws an exception
     */

    public void addDocument(DocumentId documentId, Reader reader) throws IOException {

        String Document = "";

        int x = reader.read();
        while(x != -1){
            Document += (char)x;
            x = reader.read();
        }

        Document = Document.toLowerCase();

        DocumentId p = google.get(documentId);
        if(p ==null){
            google.put(Document, documentId);
        }

    }

    /**
     * Returns the set of DocumentIds contained within the search engine that contain a given term.
     *
     * @param term
     * @return the set of DocumentIds that contain a given term
     */
    public Set<DocumentId> indexLookup(String term) {

        Set<DocumentId> t = new HashSet<DocumentId>();
        String k = term.toLowerCase();

        for(String doc: google.keySet()){
            if(doc.contains(k)){
                t.add(google.get(doc));
            }
        }

        return t;
    }

    /**
     * Returns the term frequency of a term in a particular document.
     *
     * The term frequency is number of times the term appears in a document.
     *
     * See
     * @param documentId
     * @param term
     * @return the term frequency of a term in a particular document
     * @throws IllegalArgumentException if the documentId has not been added to the engine
     */
    public int termFrequency(DocumentId documentId, String term) throws IllegalArgumentException {
        if(!google.containsValue(documentId)){
            throw new IllegalArgumentException();
        }

        term.toLowerCase();
        String Document = "";

        for(String i: google.keySet()){
            if(google.get(i).equals(documentId)){
                Document = i;
                break;
            }
        }

        int count = 0;
        int i = Document.indexOf(term);

        while(i !=-1){
            count++;
            Document = Document.substring(i+term.length());
            i = Document.indexOf(term);
        }

        return count;

       for(i = Document.indexOf(term); i<Integer.MAX_VALUE; i++){
           if(i!=-1){
               count++;
               Document = Document.substring(i + 1);
               i = Document.indexOf(term);
           }

           else{
           break;
           }
       }


    }

    /**
     * Returns the inverse document frequency of a term across all documents in the index.
     *
     * For our purposes, IDF is defined as log ((1 + N) / (1 + M)) where
     * N is the number of documents in total, and M
     * is the number of documents where the term appears.
     *
     * @param term
     * @return the inverse document frequency of term
     */
    public double inverseDocumentFrequency(String term) {


        double googlesize = google.size(), mTotal = 0;
        term.toLowerCase();

        for(String Document: google.keySet()){
            if(Document.indexOf(term) != -1){
                mTotal++;
            }
        }

        double value = (double)Math.log((double)(1+googlesize) / (double)(1+ mTotal));
        return (double)value;
    }

    /**
     * Returns the tfidf score of a particular term for a particular document.
     *
     * tfidf is the product of term frequency and inverse document frequency for the given term and document.
     *
     * @param documentId
     * @param term
     * @return the tfidf of the the term/document
     * @throws IllegalArgumentException if the documentId has not been added to the engine
     */
    public double tfIdf(DocumentId documentId, String term) throws IllegalArgumentException {

        if(!google.containsValue(documentId)){
            throw new IllegalArgumentException();
        }

        return termFrequency(documentId, term) * inverseDocumentFrequency(term);
    }

    /**
     * Returns a sorted list of documents, most relevant to least relevant, for the given term.
     *
     * A document with a larger tfidf score is more relevant than a document with a lower tfidf score.
     *
     * Each document in the returned list must contain the term.
     *
     * @param term
     * @return a list of documents sorted in descending order by tfidf
     */
    public List<DocumentId> relevanceLookup(String term) {
        List<DocumentId> Documents = new ArrayList<DocumentId>();
        Set<DocumentId> dc = indexLookup(term);
        TfIdfComparator comparekek = new TfIdfComparator(this, term);

        for(DocumentId h: dc){
            Documents.add(h);
            for(int i = 0; i< Documents.size(); i++){
               if(comparekek.compare(h, Documents.get(i)) <= 0){
                   Documents.add(i, h);
                   break;
               }

               if(i == Documents.size()-1)
                   Documents.add(h);
           if(Documents.size() == 0){
               Documents.add(h);
           }
       }
        }

        Documents.sort(comparekek);

        return Documents;
    }


}