
Question

Task: Implement a program which, given a user ID, a model (TF, DF, or TF-IDF), and a value "k", returns the k most similar users based on textual descriptors. For each match, also list the overall matching score as well as the 3 terms that have the highest similarity contribution.

For this task I was given a devset and told to store it in MySQL, MongoDB, or a file/data structure; I stored it in a file. The devset consists of (user ID, "term", TF, DF, TF-IDF) values in this format: 10117222@N04 "1" 1 53 0.018867924528301886 "100" 1 16 0.0625 "101" 1 4 0.25 "108" 1 3 0.3333333333333333 "214" 1 1 1.0 "216" 1 1 1.0 "220" 1 3 0.3333333333333333. There are many such rows. I was told to use the TF values of the terms that are common to both rows; if there are no common terms, the similarity between the two users is zero.

For this I need the code in Python on Ubuntu. The subject is multimedia and web databases. It's a bit urgent; can I get it ASAP? Thank you so much in advance.

Explanation / Answer

import nltk
import math
from os import walk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import stopwords
import os

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
extras = "~ ` ! @ # $ % ^ & * ( ) - _ = + { [ } ] | : ; ' < , > . ? /"
extras_list = extras.split(" ")
dict = {}
query_dict = {}
q_tf = {}
q_idf = {}
cosine_dict = {}
cosine_list = []
document = []
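# dict        : inverted index, term -> list of [filename, count] postings
# query_dict  : raw term counts of the query
# q_tf, q_idf : log-scaled tf and idf weights of the query terms
# cosine_dict : cosine score -> filename, filled while scoring candidates
# cosine_list : all cosine scores, sorted later with heapSort
# document    : candidate files that share at least one term with the query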
#-------------------------------------------------------
#add a document's terms to the inverted index: term -> list of [filename, count] postings
def addtodict(list,k):
      for i in list:
          if i in dict:
              # the last posting belongs to file k if this term was already seen while reading k
              if dict[i][-1][0]==k:
                  dict[i][-1][1]=dict[i][-1][1]+1
              else:
                  dict[i] = dict[i]+[[k,1]]
          else:
              dict[i]=[[k,1]]
        

# ------------------------------------------------
#tokenize query
def tokenize_query(s):
      global extras_list
      query1 = word_tokenize(s)
      query = [lemmatizer.lemmatize(w.lower()) for w in query1 if not w in extras_list]
      #print(query)
      return query

#---------------------------------------------------------

def find_intersection(lower_list,query):
        intersection=[]
        for i in lower_list:
            #print(i)
            if i in query:
                if i in intersection:
                    continue
                else:
                    intersection.append(i)        
            else:
                continue
        #print(intersection)    
        return intersection

   #-------------------------------------------------------------
def document_tfidf(document,intersection,k):
          weight=[]
          for i in intersection:
              x=len(dict[i])
              for j in range(0,x):
                  if(dict[i][j][0]==k):
                      weight.append(1+math.log10(dict[i][j][1]))
        
          for i in document:
              if i in intersection:
                  continue
              else:
                  x=len(dict[i])
                  for j in range(0,x):
                      if(dict[i][j][0]==k):
                          weight.append(1+math.log10(dict[i][j][1]))
          #weighted values before normalization
          #print(weight)
        
          # cosine-normalise the document weight vector
          n_value = normalise(weight)
          for n in range(len(weight)):
              weight[n]=weight[n]/n_value
          return weight

   #--------------------------------------------------------------
def minimize_doc(document):
      listed = []
      for i in document:
          if i in listed:
              continue
          else:
              listed.append(i)
      return listed  

   #------------------------------------------------------------

def normalise(normal):
      sum = 0
      for k in normal:
          sum = sum + (k*k)
      value = math.sqrt(sum)
      return float(value)


#--------------------------------------------------
# for a query


def add_query(query):
      for i in query:
          if i in query_dict:
              query_dict[i]=query_dict[i]+1
          else:
              query_dict[i]=1

#---------------------------------------------------------------                                   
#---------------------------------------------------------------


def query_tf(query_listed):
      for i in query_listed:
          q_tf[i] = 1+math.log10(query_dict[i])
        

#---------------------------------------------------------------

def query_idf(query_listed):
      for i in query_listed:
          if i not in dict:
              q_idf[i] = 0
          else:
              q_idf[i] = math.log10((doc_count/len(dict[i])))

#---------------------------------------------------------------


#heap sort
def heapify(arr, n, i):
      largest = i
      l = 2 * i + 1  
      r = 2 * i + 2   

      if l < n and arr[i] < arr[l]:
          largest = l
        
      if r < n and arr[largest] < arr[r]:
          largest = r

      if largest != i:
          arr[i],arr[largest] = arr[largest],arr[i]
          heapify(arr, n, largest)

def heapSort(arr):
      n = len(arr)
      for i in range(n, -1, -1):
          heapify(arr, n, i)

      for i in range(n-1,-1, -1):
          arr[i], arr[0] = arr[0], arr[i]
          heapify(arr, i, 0)
  

#---------------------------------------------------------------
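# Main flow: build an inverted index from every file in the corpus directory,
# read a query, weight it with tf-idf, then rank each candidate document by
# cosine similarity against the query and report the top-k matches.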

f=[]
#print(query_listed)
#print(query_dict)
#print("tf of query")

#-----------------------------------------------------

corpus_dir = 'G:/Users/avina/Desktop/lol'   # change this to the directory holding the text files
for (dirpath,dirnames,filenames) in walk(corpus_dir):
      # keep full paths so the files can be reopened later regardless of the working directory
      f.extend(os.path.join(dirpath,name) for name in filenames)

doc_count = len(f)

for k in f:
         print(k)
         fo = open(k,"r",encoding="utf8")

         data = fo.read()

         # tokenize, lowercase and lemmatize the document, dropping punctuation tokens
         list = word_tokenize(data)
         lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list]

         # record this document's term counts in the inverted index
         addtodict(lower_list,k)
         fo.close()

#-------------------------------------------------------------------------------
s = input("enter the query: ")  # query text; the original snippet read this from a Tkinter Entry widget (e1.get())
query = tokenize_query(s)
add_query(query)
query_listed = minimize_doc(query)
query_tf(query_listed)

#-------------------------------------------------------------------------------
query_idf(query_listed)
#idf of query
#print(q_idf)

tf_idf = {}

for i in q_tf:
    
      tf_idf[i] = (q_tf[i]*q_idf[i])

#tf idf value before normalization  
#print(tf_idf)

sum = 0
for i in tf_idf:
      sum = sum+tf_idf[i]*tf_idf[i]

#normalized value
value = math.sqrt(sum)
#print(value)

# normalize the query vector; a zero norm means no query term occurs in the corpus
if value==0:
      print("No documents found")
else:
      for i in tf_idf:
          tf_idf[i]=tf_idf[i]/value

#----------------------------------------------
#normalized tf idf value
#print(tf_idf)

# tf_idf dictionary contains the tf_idf values of query
#------------------------------------------------------------
#-------------------------------------------------------------

for i in query_listed:
      if i in dict:
          local_len = len(dict[i])
          for j in range(0,local_len):
              file_name = dict[i][j][0]
              if file_name in document:
                  continue
              else:
                  document.append(dict[i][j][0])
      else:
          continue

#print(document)  
#-------------------------------------------------------------

# score each candidate document against the query
for k in document:
         fo = open(k,"r",encoding="utf8")
        # print(k)
         data = fo.read()

         list = word_tokenize(data)
         lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if not w in extras_list]
    
         fo.close()
  
         listed = minimize_doc(lower_list)
         intersection = find_intersection(lower_list,query)
# print("intersection values")
# print(intersection)

       
         weight = document_tfidf(listed,intersection,k)
#       print("weighted values of document after normalization")
#       print(weight)

         # dot product over the shared terms gives this document's cosine score
         total = []
         ins_len = len(intersection)
         l=0
         cosine = 0
         for i in intersection:
             total.append(weight[l]*tf_idf[i])
             cosine = cosine+total[l]
             l = l+1
   #      tf idf of common values between document and query  
   #      print(total)

   #      total cosine value
   #      print(cosine)
         cosine_list.append(cosine)
         cosine_dict[cosine] = k

#print(cosine_dict)     

n = len(cosine_list)
heapSort(cosine_list)
#print(cosine_list)
count=0
ll = []
k = int(input("enter the value of k:"))   # number of top matches to return
# walk the sorted scores from highest to lowest and collect the top-k documents
# (note: documents with identical scores share a key in cosine_dict, so only one of them is kept)
for i in range(n-1,-1,-1):
      if count>=k:
          break
      f1=cosine_list[i]
      ll.append(cosine_dict[f1])
      count = count+1

# documents ranked by similarity to the query, best first
print(ll)
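
The code above ranks plain text files from a directory; the task itself, though, compares devset rows of the form user_id "term" TF DF TF-IDF ... . Below is a minimal sketch of that user-to-user similarity, under a few stated assumptions: the devset is stored one user per line in a single file (devset.txt is a placeholder name), terms contain no whitespace (as in the sample row), and the matching score is the sum of products of the chosen model's values over the common terms. The question only says to use the values of the terms common to both rows and to return zero when nothing is shared, so that combination rule is an assumption; the per-term products also give the 3 highest-contributing terms for each match.

MODEL_INDEX = {"TF": 0, "DF": 1, "TF-IDF": 2}   # position of each value after a term in a devset row

def load_devset(path):
    # one user per line: user_id "term" TF DF TF-IDF "term" TF DF TF-IDF ...
    users = {}
    with open(path, encoding="utf8") as fh:
        for line in fh:
            parts = line.split()
            if not parts:
                continue
            user_id, rest = parts[0], parts[1:]
            terms = {}
            # each descriptor occupies four tokens: "term" TF DF TF-IDF
            for j in range(0, len(rest) - 3, 4):
                term = rest[j].strip('"')
                terms[term] = (float(rest[j+1]), float(rest[j+2]), float(rest[j+3]))
            users[user_id] = terms
    return users

def similar_users(users, target_id, model, k):
    # score every other user by the sum of products of the chosen model's
    # values over the terms common to both users (zero if nothing is shared)
    idx = MODEL_INDEX[model]
    target = users[target_id]
    results = []
    for uid, terms in users.items():
        if uid == target_id:
            continue
        contributions = {t: target[t][idx] * v[idx] for t, v in terms.items() if t in target}
        score = sum(contributions.values())
        top_terms = sorted(contributions, key=contributions.get, reverse=True)[:3]
        results.append((score, uid, top_terms))
    results.sort(key=lambda r: r[0], reverse=True)
    return results[:k]

if __name__ == "__main__":
    users = load_devset("devset.txt")            # "devset.txt" is a placeholder; point this at the stored devset
    target_id = input("enter the user ID: ")
    model = input("enter the model (TF, DF, TF-IDF): ")
    k = int(input("enter the value of k: "))
    for score, uid, top_terms in similar_users(users, target_id, model, k):
        print(uid, "score:", score, "top 3 terms:", top_terms)

Changing the model argument switches the same routine between TF, DF, and TF-IDF values without touching the rest of the code.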
