Question
I NEED HELP WITH THE FOLLOWING CODE, below is the problem and the code.
We will use a full day's worth of tweets as input (there are 4.4M tweets in total in this file, but you only need to read 1M): http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/OneDayOfTweets.txt
Repeat what you did in part-b, but instead of saving tweets to the file, populate the 3-table schema that you created in SQLite. Be sure to execute commit and verify that the data has been successfully loaded (report loaded row counts for each of the 3 tables). How long did this step take?
import urllib.request, time, json, sqlite3
conn = sqlite3.connect('Tweets_Database_final.db')
c = conn.cursor()
c.execute('DROP TABLE IF EXISTS Tweets');
c.execute('DROP TABLE IF EXISTS USER');
# Create Table Tweets
c.execute('''CREATE TABLE tweet
created_at DATETIME,
user_id INT,
id_str TEXT,
text TEXT,
source TEXT,
in_reply_to_user_id INT,
in_reply_to_screen_name TEXT,
in_reply_to_status_id INT,
retweet_count INT,
contributors TEXT
CONSTRAINT tweet_FK
FOREIGN KEY (user_id) REFERENCES user(id),
# Create Table User
c.execute('''CREATE TABLE user
id INT,
screen_name TEXT,
description TEXT,
friends_count INT,
contributors TEXT
CONSTRAINT USER_PK
PRIMARY KEY(ID)
# Create Table User
c.execute('''CREATE TABLE Geo
user_id INT,
type TEXT,
longitude INT,
latitude INT,
CONSTRAINT tweet_FK_2
FOREIGN KEY (user_id) REFERENCES tweet(user_id)
Line = webFD.readline()
tweetsdata= (Line.decode('utf32')).split('EndOfTweet')
errors = open("final_errors.txt", "w")
Datatweet = []
Datauser = []
DataGeo = []
for Line in range(500000):
try:
tweetrecord= json.loads(Line)
datatweet.append((tweetrecord["created_at"], tweetrecord["user_id "], tweetrecord["id_str"], tweetrecord["text"],tweetrecord["source"], tweetrecord["in_reply_to_user_id"],tweetrecord["in_reply_to_screen_name"], tweetrecord["in_reply_to_status_id"],tweetrecord['retweet_count'],tweetrecord['contributors']))
if tweetrecord[key] in ['',[],'null']:
datatweet.append(None)
else:
datatweet.append(tweetrecord[key])
c.execute('INSERT INTO tweet (created_at, user_id, id_str, text, source, in_reply_to_user_id, in_reply_to_screen_name, in_reply_to_status_id,retweet_count, contributors) VALUES (?,?,?,?,?,?,?,?,?,?)',datatweet)
datauser.append((tweetrecord["id"], tweetrecord["screen_name "], tweetrecord["desription"], tweetrecord["source"], tweetrecord["friends_count"]))
if tweetrecord[key] in ['',[],'null']:
datauser.append(None)
else:
datauser.append(tweetrecord[key])
c.executemany('INSERT INTO User (id, screen_name, description, source, friends_count) VALUES (?,?,?,?,?)', datauser)
dataGeo.append((tweetrecord["id_str"], tweetrecord["type "], tweetrecord["longitude"], tweetrecord["source"], tweetrecord["longitude"]))
if tweetrecord[key] in ['',[],'null']:
dataGeo.append(None)
else:
dataGeo.append(tweetrecord[key])
count += 1
c.executemany('INSERT INTO Geo (id_str, type, longitude, source, longitude ) VALUES (?,?,?,?,?)', dataGeo)
except ValueError:
print(tweet)
errors.write(tweet)
print("tweets loaded to file: ", count)
end = time.time()
print ("Difference is ", round((end-start),3), 'seconds')
print ("Performance : ", round(100000/(end-start), 3), ' operations per second ')
print("tweetfinal.txt on desktop")
Explanation / Answer
Hi, this is a simple script that loads data from a file into the database tables; let me explain it step by step.
For any script that loads data, we first drop the existing tables, recreate them, and then insert the data.
import urllib.request, time, json, sqlite3
conn = sqlite3.connect('Tweets_Database_final.db')
c = conn.cursor()
c.execute('DROP TABLE IF EXISTS Tweets');
c.execute('DROP TABLE IF EXISTS USER');
The above part does that: we open a cursor and execute SQL statements with c.execute(sql). Note, though, that the drops target a table called Tweets while the table created below is named tweet, and the Geo table is never dropped, so re-running the script would fail; the sketch below drops all three under the names actually created.
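A minimal sketch of dropping all three tables (children before the parents they reference) and turning on SQLite's foreign-key enforcement, which is off by default, could look like this (table names assumed to match the CREATE statements below):

c.execute('PRAGMA foreign_keys = ON')   # SQLite ignores FOREIGN KEY constraints unless this pragma is enabled
for t in ('Geo', 'tweet', 'user'):      # drop child tables before the parent tables they reference
    c.execute('DROP TABLE IF EXISTS ' + t)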
# Create table tweet
c.execute('''CREATE TABLE tweet (created_at DATETIME, user_id INT, id_str TEXT, text TEXT, source TEXT, in_reply_to_user_id INT, in_reply_to_screen_name TEXT, in_reply_to_status_id INT, retweet_count INT, contributors TEXT, CONSTRAINT tweet_FK FOREIGN KEY (user_id) REFERENCES user(id))''')
/* look at this now: this creates the tweet table with a foreign key referencing the id column of the user table, and for this reason the user table should be created before the tweet table */
# Create Table User
c.execute('''CREATE TABLE user (id INT, screen_name TEXT, description TEXT, friends_count INT, contributors TEXT, CONSTRAINT user_PK PRIMARY KEY (id))''')
/* this creates the user table with id as the primary key; place this statement above the tweet table creation */
# Create Table Geo
c.execute('''CREATE TABLE Geo (user_id INT, type TEXT, longitude INT, latitude INT, CONSTRAINT tweet_FK_2 FOREIGN KEY (user_id) REFERENCES tweet(user_id))''')
/* this creates the Geo table with a foreign key referencing user_id from the tweet table; your version was missing the opening parenthesis after the table name and spelled latitude as "atitude". Also note that when foreign-key enforcement is on, SQLite expects the referenced column tweet(user_id) to be the primary key or covered by a UNIQUE index, otherwise inserts into Geo fail with a "foreign key mismatch" error */
Therefore the order of creation should be: user, then tweet, and then Geo.
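A quick way to sanity-check that all three tables exist before loading is to query SQLite's catalog table; a minimal sketch:

c.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
print(c.fetchall())   # should list user, tweet and Geo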
You are not loading the file at all in the above code; to do that, you need to add something like the following at the start.
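A minimal sketch, using the assignment URL and recording the start time (the timing print at the end refers to a start variable that is never set anywhere in your script):

url = 'http://rasinsrv07.cstcis.cti.depaul.edu/CSC455/OneDayOfTweets.txt'
webFD = urllib.request.urlopen(url)   # open the remote file for reading
start = time.time()                   # remember when the load began so we can report how long it took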
Finally, the main logic:
/* you were getting only one line: readline() returns a single line, so read the whole response instead (readlines() would give a list of byte strings, which cannot be decoded in one call) */
rawdata = webFD.read()                                      # read the whole response body as bytes
tweetsdata = rawdata.decode('utf32').split('EndOfTweet')    # decode (use whatever encoding the file actually has) and split into individual tweet strings
errors = open("final_errors.txt", "w")   # file for recording tweets that fail to parse
datatweet = []   # note the lower-case names: they have to match the names used inside the loop below
datauser = []
dataGeo = []
count = 0        # counter of successfully parsed tweets; it is incremented later but was never initialized
for Line in tweetsdata[:1000000]:   # iterate over the tweet strings themselves; range(500000) loops over the numbers 0..499999, not the data, and 500,000 is only half of the 1M tweets the assignment asks for
try:
tweetrecord = json.loads(Line)   # parse one tweet string into a Python dictionary
# the key names below must match what is actually in the JSON (the stray trailing space in "user_id " would cause a KeyError)
datatweet.append((tweetrecord["created_at"], tweetrecord["user_id"], tweetrecord["id_str"], tweetrecord["text"], tweetrecord["source"], tweetrecord["in_reply_to_user_id"], tweetrecord["in_reply_to_screen_name"], tweetrecord["in_reply_to_status_id"], tweetrecord["retweet_count"], tweetrecord["contributors"]))
/* your if/else that replaced '', [] and 'null' with None referred to an undefined variable `key`; that kind of missing-value handling has to loop over the individual fields, see the consolidated sketch after this walkthrough */
c.executemany('INSERT INTO tweet (created_at, user_id, id_str, text, source, in_reply_to_user_id, in_reply_to_screen_name, in_reply_to_status_id, retweet_count, contributors) VALUES (?,?,?,?,?,?,?,?,?,?)', datatweet)   # datatweet is a list of tuples, so executemany (not execute) is needed; better still, run it once after the loop
datauser.append((tweetrecord["id"], tweetrecord["screen_name"], tweetrecord["description"], tweetrecord["friends_count"], tweetrecord["contributors"]))   # "desription" was misspelled, and source is not a column of the user table
/* same undefined-`key` problem as above */
c.executemany('INSERT INTO user (id, screen_name, description, friends_count, contributors) VALUES (?,?,?,?,?)', datauser)   # the column list must match the user table; this insert has to run BEFORE the tweet insert because tweet references user(id)
dataGeo.append((tweetrecord["user_id"], tweetrecord["type"], tweetrecord["longitude"], tweetrecord["latitude"]))   # the Geo table has columns user_id, type, longitude, latitude; your tuple had id_str, source and longitude twice
/* same undefined-`key` problem as above */
count += 1
c.executemany('INSERT INTO Geo (user_id, type, longitude, latitude) VALUES (?,?,?,?)', dataGeo)   # column list fixed to match the Geo table
except ValueError:
print(Line)                      # the variable `tweet` was never defined; the raw text being processed here is Line
errors.write(str(Line) + '\n')   # log the unparseable record to the error file
/* As mentioned earlier, you also need to reorder the insert statements to match the table-creation order, otherwise you will get foreign key violations: insert into user first, then into tweet, and lastly into Geo, and make each column list match the table it targets. A consolidated sketch of the corrected loop follows. */
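Putting it together, here is a minimal sketch of the corrected load loop. It reuses the cursor c, the tweetsdata list and the errors file from above; the flat key names are simply carried over from your code (in standard Twitter JSON the user fields are nested under tweetrecord['user'] and the coordinates under tweetrecord['geo'], so adjust the lookups to whatever your file actually contains):

def clean(value):
    # map the "empty" markers from the file to SQL NULL
    return None if value in ('', [], 'null') else value

count = 0
datauser, datatweet, dataGeo = [], [], []
for Line in tweetsdata[:1000000]:                 # at most 1M tweets, as the assignment asks
    try:
        tweetrecord = json.loads(Line)
    except ValueError:                            # malformed JSON goes to the error file
        errors.write(str(Line) + '\n')
        continue
    datauser.append(tuple(clean(tweetrecord.get(k)) for k in
        ('id', 'screen_name', 'description', 'friends_count', 'contributors')))
    datatweet.append(tuple(clean(tweetrecord.get(k)) for k in
        ('created_at', 'user_id', 'id_str', 'text', 'source', 'in_reply_to_user_id',
         'in_reply_to_screen_name', 'in_reply_to_status_id', 'retweet_count', 'contributors')))
    dataGeo.append(tuple(clean(tweetrecord.get(k)) for k in
        ('user_id', 'type', 'longitude', 'latitude')))
    count += 1

# parents before children, so the foreign keys are satisfied
c.executemany('INSERT INTO user VALUES (?,?,?,?,?)', datauser)
c.executemany('INSERT INTO tweet VALUES (?,?,?,?,?,?,?,?,?,?)', datatweet)
c.executemany('INSERT INTO Geo VALUES (?,?,?,?)', dataGeo)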
print("tweets loaded to file: ", count)
end = time.time()
print ("Difference is ", round((end-start),3), 'seconds')
print ("Performance : ", round(100000/(end-start), 3), ' operations per second ')
print("tweetfinal.txt on desktop")
Hope this was clear; if so, give a thumbs up, and if not, let me know in the comments.
Good day.