# This is Poogle, a simple web crawler/indexer used while
# teaching CS112.
#
# This code was created by Dan Fleck
# at George Mason University
#
# NOTE: this script targets Python 2 (urllib.urlopen, sgmllib, raw_input).
# Every print call passes exactly one argument in parentheses, which
# produces the same output as the Python 2 print statement.

import urllib
import sgmllib
import shelve

from WebParser import MyParser

# Crawl settings used by the main menu.
START_URL = 'http://cs.gmu.edu/faculty/'
CRAWL_DEPTH = 2
INDEX_URL = 'http://cs.gmu.edu'

MENU = '''
*********************************************************************
Welcome to Poogle!
A stupid implementation of a Python Search Engine
*********************************************************************

Main Menu
(s)earch
(c)rawl
(i)ndex
(q)uit
'''

# Shared state.  main() rebinds linkInfo to the on-disk shelf and seeds
# visitedLinks from it; until then the index is a plain in-memory dict.
visitedLinks = set()
linkInfo = {}


def indexer(url, words):
    '''Store the url and its associated words in the index.

    url   -- the page address, used as the index key
    words -- an iterable of words/descriptions found on that page;
             duplicates are dropped because a set is stored

    The shelf is presumed to be open already.
    '''
    linkInfo[url] = set(words)


def search(term):
    '''Print every indexed URL that has a word containing the search term.

    term -- the text to look for; matching is case-insensitive and
            succeeds when term is a substring of any word stored for
            the URL
    '''
    needle = term.lower()  # lower the term once, not once per word
    matches = set()
    for url in linkInfo.keys():
        # any() stops at the first matching word for this URL
        if any(needle in word.lower() for word in linkInfo[url]):
            matches.add(url)

    for link in matches:
        print(link)


def crawl(url, depth):
    '''Fetch a URL, index it, and recursively crawl the pages it links to.

    url   -- the page to fetch
    depth -- how many more levels of links to follow; at depth 0 the
             page's links are printed but not fetched

    Links that were already visited are printed rather than crawled
    again.  A page that cannot be opened or parsed is reported and
    skipped.
    '''
    global visitedLinks
    visitedLinks.add(url)
    # Leading dashes give a rough picture of how deep in the crawl we are.
    print((10 - depth) * '-' + url)

    # Get the URL (sorta like a file)
    try:
        f = urllib.urlopen(url)
    except IOError as moreDetails:
        print(moreDetails)
        return

    # Read the entire URL into a string, always releasing the connection.
    try:
        s = f.read()
    finally:
        f.close()

    # Try and parse the page.
    myparser = MyParser()
    try:
        myparser.parse(s)
        # Index the information
        indexer(url, myparser.get_descriptions())
        # Get the hyperlinks (a set, so each link is handled once).
        hyperlinks = set(myparser.get_hyperlinks())
    except sgmllib.SGMLParseError as err:
        print('Error parsing site: %s  %s' % (url, err))
        return

    for link in hyperlinks:
        if depth > 0 and link not in visitedLinks:
            crawl(link, depth - 1)
        else:
            print(link)


# The main program
def main():
    '''Run the interactive Poogle menu until the user quits.

    Opens the 'links' shelf as the persistent index, treats every URL
    already stored in it as visited, and then loops over the menu:
    (s)earch, (c)rawl, (i)ndex, (q)uit.
    '''
    global visitedLinks, linkInfo
    linkInfo = shelve.open('links', writeback=True)
    try:
        # Pages indexed on a previous run do not need to be crawled again.
        visitedLinks = set(linkInfo.keys())

        opt = 'x'  # Initialize it to something so we enter the while loop
        while opt != 'q':
            print(MENU)
            opt = raw_input('What would you like to do? ')
            if opt == 's':
                term = raw_input('What are you looking for?')
                search(term)
            elif opt == 'c':
                try:
                    crawl(START_URL, CRAWL_DEPTH)
                except KeyboardInterrupt as keyInt:
                    print('Ctrl-C! Stopping... %s' % keyInt)
            elif opt == 'i':
                # Depth 0 fetches and indexes just this one page without
                # following its links.
                crawl(INDEX_URL, 0)
            elif opt == 'q':
                pass
            else:
                print('\n\nUnknown option. Try again! \n\n')
    finally:
        # Make sure to close the link info list, even after an error:
        # with writeback=True, cached entries are only written on close.
        linkInfo.close()


if __name__ == '__main__':
    main()