Question
Task: Implement a program which, given a user ID, a model (TF, DF, TF-IDF), and a value k, returns the k most similar users based on textual descriptors. For each match, also list the overall matching score as well as the 3 terms that have the highest similarity contribution.
For this task I was given a devset and told to store it in either MySQL, MongoDB, or a file/data structure; I stored it in a file. The devset consists of (user id, "term", TF, DF, TF-IDF) values in this format: 10117222@N04 "1" 1 53 0.018867924528301886 "100" 1 16 0.0625 "101" 1 4 0.25 "108" 1 3 0.3333333333333333 "214" 1 1 1.0 "216" 1 1 1.0 "220" 1 3 0.3333333333333333. There are many such rows. I was told to use the TF values of the terms that are common in both rows; if there are no common terms, then the similarity measure between them is zero.
For this I need the code in Python on Ubuntu. The subject is multimedia and web databases. It is a bit urgent, so I would appreciate it as soon as possible. Thank you so much in advance.
Explanation / Answer
import nltk
import math
import os
from os import walk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
# The NLTK data used here (punkt, wordnet, stopwords) must be downloaded
# once beforehand, e.g. nltk.download('punkt').
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
# Punctuation tokens to drop during tokenisation.
extras = "~ ` ! @ # $ % ^ & * ( ) - _ = + { [ } ] | : ; ' < , > . ? /"
extras_list = extras.split(" ")
# Global state (the name dict shadows the built-in; kept from the original code).
dict = {}          # inverted index: term -> list of [filename, count] pairs
query_dict = {}    # raw term counts of the query
q_tf = {}          # log-scaled tf of each query term
q_idf = {}         # idf of each query term
cosine_dict = {}   # cosine score -> filename
cosine_list = []   # all cosine scores
document = []      # candidate files sharing at least one term with the query
#-------------------------------------------------------
#------------------------------------------------------------
#add to dictionary
def addtodict(list, k):
    # Build the inverted index: term -> list of [filename, count] pairs.
    # Files are indexed one at a time, so the last pair for a term always
    # belongs to the file currently being read.
    for i in list:
        if i in dict:
            if k in dict[i][-1]:
                dict[i][-1][1] = dict[i][-1][1] + 1
            else:
                dict[i] = dict[i] + [[k, 1]]
        else:
            dict[i] = [[k, 1]]
# ------------------------------------------------
#tokenize query
def tokenize_query(s):
    global extras_list
    query1 = word_tokenize(s)
    query = [lemmatizer.lemmatize(w.lower()) for w in query1 if w not in extras_list]
    #print(query)
    return query
#---------------------------------------------------------
def find_intersection(lower_list, query):
    # Unique terms that occur in both the document and the query.
    intersection = []
    for i in lower_list:
        if i in query and i not in intersection:
            intersection.append(i)
    #print(intersection)
    return intersection
#-------------------------------------------------------------
def document_tfidf(document, intersection, k):
    # Log-scaled weights (1 + log10(count)) for document k: intersection
    # terms first, then the remaining terms, then L2-normalise the vector.
    weight = []
    for i in intersection:
        x = len(dict[i])
        for j in range(0, x):
            if dict[i][j][0] == k:
                weight.append(1 + math.log10(dict[i][j][1]))
    for i in document:
        if i in intersection:
            continue
        x = len(dict[i])
        for j in range(0, x):
            if dict[i][j][0] == k:
                weight.append(1 + math.log10(dict[i][j][1]))
    #weighted values before normalization
    #print(weight)
    n_value = normalise(weight)
    n = 0
    for w in weight:
        weight[n] = w / n_value
        n = n + 1
    return weight
#--------------------------------------------------------------
def minimize_doc(document):
    # Remove duplicate terms while preserving order.
    listed = []
    for i in document:
        if i not in listed:
            listed.append(i)
    return listed
#------------------------------------------------------------
def normalise(normal):
    # Euclidean (L2) norm of a weight vector.
    total = 0
    for k in normal:
        total = total + (k * k)
    return float(math.sqrt(total))
#--------------------------------------------------
# for a query
def add_query(query):
    for i in query:
        if i in query_dict:
            query_dict[i] = query_dict[i] + 1
        else:
            query_dict[i] = 1
#---------------------------------------------------------------
#---------------------------------------------------------------
def query_tf(query_listed):
    for i in query_listed:
        q_tf[i] = 1 + math.log10(query_dict[i])
#---------------------------------------------------------------
def query_idf(query_listed):
    # doc_count is set at module level after the corpus walk below.
    for i in query_listed:
        if i not in dict:
            q_idf[i] = 0
        else:
            q_idf[i] = math.log10(doc_count / len(dict[i]))
#---------------------------------------------------------------
#heap sort
def heapify(arr, n, i):
    # Sift arr[i] down so the subtree rooted at i satisfies the max-heap property.
    largest = i
    l = 2 * i + 1
    r = 2 * i + 2
    if l < n and arr[i] < arr[l]:
        largest = l
    if r < n and arr[largest] < arr[r]:
        largest = r
    if largest != i:
        arr[i], arr[largest] = arr[largest], arr[i]
        heapify(arr, n, largest)

def heapSort(arr):
    # In-place ascending heap sort.
    n = len(arr)
    for i in range(n // 2 - 1, -1, -1):
        heapify(arr, n, i)
    for i in range(n - 1, -1, -1):
        arr[i], arr[0] = arr[0], arr[i]
        heapify(arr, i, 0)
#---------------------------------------------------------------
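# A possible simplification (not part of the original code): the heap sort
# above is only used to pull out the k largest cosine scores at the end of
# the script, and the standard-library heapq module can do that top-k
# selection directly, e.g. with hypothetical (score, filename) pairs:
#   import heapq
#   top_k = heapq.nlargest(k, scored_pairs)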
f = []
#-----------------------------------------------------
# Walk the dataset directory and index every file in it. Change this path
# to wherever the dataset files are stored (e.g. a path under /home on
# Ubuntu). Full paths are stored so the files can be opened later
# regardless of the working directory.
for (dirpath, dirnames, filenames) in walk('G:/Users/avina/Desktop/lol'):
    f.extend(os.path.join(dirpath, name) for name in filenames)
doc_count = len(f)
for k in f:
    print(k)
    fo = open(k, "r", encoding="utf8")
    data = fo.read()
    list = word_tokenize(data)
    lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if w not in extras_list]
    addtodict(lower_list, k)
    fo.close()
#-------------------------------------------------------------------------------
s = input("enter the query: ")  # query input (the original read this from a Tkinter Entry widget, e1.get())
query = tokenize_query(s)
add_query(query)
query_listed = minimize_doc(query)
query_tf(query_listed)
#-------------------------------------------------------------------------------
query_idf(query_listed)
#idf of query
#print(q_idf)
tf_idf = {}
for i in q_tf:
    tf_idf[i] = q_tf[i] * q_idf[i]
#tf idf value before normalization
#print(tf_idf)
# cosine-normalise the query vector
norm_sq = 0
for i in tf_idf:
    norm_sq = norm_sq + tf_idf[i] * tf_idf[i]
value = math.sqrt(norm_sq)
#print(value)
if value == 0:
    print("No documents found")
else:
    for i in tf_idf:
        tf_idf[i] = tf_idf[i] / value
#----------------------------------------------
#normalized tf idf value
#print(tf_idf)
# tf_idf dictionary contains the tf_idf values of query
#------------------------------------------------------------
#-------------------------------------------------------------
# Candidate documents: every file that contains at least one query term.
for i in query_listed:
    if i in dict:
        local_len = len(dict[i])
        for j in range(0, local_len):
            file_name = dict[i][j][0]
            if file_name not in document:
                document.append(file_name)
#print(document)
#-------------------------------------------------------------
for k in document:
    fo = open(k, "r", encoding="utf8")
    data = fo.read()
    list = word_tokenize(data)
    lower_list = [lemmatizer.lemmatize(w.lower()) for w in list if w not in extras_list]
    fo.close()
    listed = minimize_doc(lower_list)
    intersection = find_intersection(lower_list, query)
    weight = document_tfidf(listed, intersection, k)
    # Dot product of the normalised document and query vectors,
    # restricted to the terms they share.
    total = []
    l = 0
    cosine = 0
    for i in intersection:
        total.append(weight[l] * tf_idf[i])
        cosine = cosine + total[l]
        l = l + 1
    cosine_list.append(cosine)
    cosine_dict[cosine] = k  # note: documents with identical cosine scores overwrite each other here
#print(cosine_dict)
n = len(cosine_list)
heapSort(cosine_list)
#print(cosine_list)
count = 0
ll = []
k = int(input("enter the value of k:"))
for i in range(n - 1, -1, -1):
    if count >= k:
        break
    f1 = cosine_list[i]
    ll.append(cosine_dict[f1])
    count = count + 1
print("top", k, "matches:", ll)
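The script above answers a free-text query against a folder of text files. The task in the question, however, works on the devset rows (a user id followed by repeated "term" TF DF TF-IDF groups) and asks for the k users most similar to a given user, together with the 3 terms that contribute most to each match. A minimal sketch of that part is given below; it assumes the space-separated line format shown in the question, a hypothetical file name devset.txt, and (since the question does not fix the exact combination rule) a simple sum of products of the chosen weights over the terms the two users share, which is zero when there are no common terms. All function and variable names here are illustrative, not part of the code above.

import heapq

def load_devset(path):
    # Parse lines of the form:
    #   <user id> "term" TF DF TF-IDF "term" TF DF TF-IDF ...
    # into {user_id: {term: (tf, df, tfidf)}}.
    users = {}
    with open(path, encoding="utf8") as fh:
        for line in fh:
            parts = line.split()
            if not parts:
                continue
            user_id, rest = parts[0], parts[1:]
            terms = {}
            for j in range(0, len(rest) - 3, 4):
                term = rest[j].strip('"')
                tf, df, tfidf = float(rest[j + 1]), float(rest[j + 2]), float(rest[j + 3])
                terms[term] = (tf, df, tfidf)
            users[user_id] = terms
    return users

def similarity(terms_a, terms_b, model=0):
    # model indexes the stored tuple: 0 = TF, 1 = DF, 2 = TF-IDF.
    # Sum the products of the chosen weight over the common terms;
    # with no common terms the sum is empty and the similarity is 0.
    contributions = {}
    for term in set(terms_a) & set(terms_b):
        contributions[term] = terms_a[term][model] * terms_b[term][model]
    return sum(contributions.values()), contributions

def most_similar(users, query_user, k, model=0):
    # Score every other user against query_user and keep the k best,
    # each with the 3 terms that contribute most to its score.
    scored = []
    for other, terms in users.items():
        if other == query_user:
            continue
        score, contributions = similarity(users[query_user], terms, model)
        top_terms = heapq.nlargest(3, contributions, key=contributions.get)
        scored.append((score, other, top_terms))
    return heapq.nlargest(k, scored)

if __name__ == "__main__":
    users = load_devset("devset.txt")              # hypothetical file name
    for score, other, top_terms in most_similar(users, "10117222@N04", 5):
        print(other, score, top_terms)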