Re-write the following code for browsing Wikipedia at random to use regular expressions
ID: 664437 • Letter: R
Question
Re-write the following code for browsing Wikipedia at random to use regular expressions to find links, instead of manually breaking the tags apart. All of your links should be valid ones. Additionally, write an additional regular expression to find images on the page. Download all of the images to a folder called “wiki_pics”. You will have to create the folder manually. The downloaded images should keep their original file names. Your code should ONLY capture images.
Here is the original code:
import webbrowser, urllib, random
def linkreader(url):
    """Return the /wiki/... targets of article links found on the page at *url*.

    Fetches the raw HTML, splits it into tag fragments, and keeps only
    fragments that contain an href pointing at a relative /wiki path
    (fragments mentioning ".org" are skipped to avoid external links).
    """
    web_page = urllib.urlopen(url)
    lines = web_page.read()
    web_page.close()
    # Turn every ">" into "<" so a single split isolates each tag's text.
    lines = lines.replace(">", "<")
    lines = lines.split("<")
    links = []
    for ent in lines:
        # BUG FIX: the original wrote `if "href" and "/wiki" in ent`, where
        # the bare string "href" is always truthy and therefore never
        # filtered anything. Each substring must be tested explicitly.
        if "href" in ent and "/wiki" in ent and ".org" not in ent:
            links.append(ent)
    flinks = []
    for let in links:
        # The link target is the text between the first pair of quotes.
        let = let.split('"')
        flinks.append(let[1])
    return flinks
start = raw_input("Where would you like to start? " )
jumps = int(raw_input("How many jumps? "))
base = "https://en.wikipedia.org"
webbrowser.open(start)
for jump in range(jumps):
print " Jumping from: "+start
dest1 = random.choice(linkreader(start)).replace('"',"")
print "To: ",base+dest1
webbrowser.open_new_tab(base+dest1)
start = base+dest1
Needs to be in Python, tested and working. Make sure you use regular expressions to find links instead of manually breaking tags apart
Explanation / Answer
import urllib, random, re, os def get_links(url): pages = [] #Get page info try: web_page = urllib.urlopen(url) except IOError: print "Not a valid webpage." else: lines = web_page.read() web_page.close() links = [item for item in re.findall('"/wiki/[w.-]+"', lines)] for link in links: link = link.replace(""", "") pages.append(link.replace('/wiki', 'http://en.wikipedia.org/wiki')) return pages def image_list(url): #Get image urls try: web_page = urllib.urlopen(url) except IOError: print "Not a valid webpage." else: lines = web_page.read() web_page.close() img_lines = [item for item in re.findall('img .*?[alt=""]? src="(.*?)"', lines)] for each in img_lines: each = each.replace("//", "http://") if ".png" in each.lower() or ".gif" in each.lower() or ".jpg" in each.lower(): image = os.path.basename(each) print "Saving " + image + " to wiki_pics/" urllib.urlretrieve(each, os.path.join(os.getcwd(), "wiki_pics", image)) #Main url = raw_input("Where would you like to start: ") while True: try: jumps = int(raw_input("How many jumps? ")) except ValueError: print "You need to enter a number." else: break for i in range(jumps): links = get_links(url) print "Jumping From: " + url image_list(url) newURL = random.choice(links) print "To: " + newURL url = newURL print
Related Questions
Navigate
Integrity-first tutoring: explanations and feedback only — we do not complete graded work. Learn more.