# This is Poogle a simple web crawler/indexer used while # teaching CS112. # # This code was created by Dan Fleck # at George Mason University # import urllib, sgmllib, shelve from WebParser import MyParser def indexer(url, words): '''Stores the url and associated words into the shelf (that is presumed open already)''' def search(term, indexShelf): '''Print a list of URLs that have the specified search term in them''' foundURLs = set() # Create an empty set # Searching for the words in words shelf term = term.lower() keys = indexShelf.keys() # Returns a list of all keys on the shelf #print keys for url in keys: #print 'URL is:',url words = indexShelf[url] #print "-----", words for phrase in words: # Look for the term in the phrase` phrase = phrase.lower() returnVal = phrase.find(term) if returnVal > -1: #print 'URL last char is %s ' %(url[-1]) url = url.rstrip() url = url.rstrip('/') foundURLs.add(url) for url in foundURLs: print 'Found in : [%s]' %(url) def crawl(url, depth, indexShelf, indexedPages): '''Crawls a URL, indexing each URL if they haven't already been indexed''' if url in indexedPages: return indexedPages.add(url) # Add to the set so we don't index again print 'Crawling URL:', url # Read the page and find the hyperlinks # Get something to work with. try: f = urllib.urlopen(url) except IOError, details: print 'IOERROR: ',details return s = f.read() # Try and process the page. # The class should have been defined first, remember. myparser = MyParser() try: myparser.parse(s) except sgmllib.SGMLParseError, details: print 'PARSER ERROR:', details return # Get the hyperlinks. hyperlinks = set(myparser.get_hyperlinks()) words = set(myparser.get_descriptions()) # Read all the words and index them indexShelf[url] = words # Open all the hyperlinks if depth != 0: for link in hyperlinks: crawl(link, depth-1, indexShelf, indexedPages) # The main program def main(): indexShelf = shelve.open('late_data', writeback=True) # Open the shelf try: indexedPages = indexShelf['indexedPages'] except KeyError, err: indexedPages = set() # Create empty set indexShelf['indexedPages'] = indexedPages opt = 'x' # Initialize it to something so we enter the while loop while opt != 'q': print ''' ********************************************************************* Welcome to Poogle! A stupid implementation of a Python Search Engine ********************************************************************* Main Menu (s)earch (c)rawl (q)uit ''' opt = raw_input('What would you like to do? ') if opt == 's': term = raw_input('What are you looking for?') search(term, indexShelf) elif opt == 'c': try: crawl('http://cs.gmu.edu/faculty/', 2, indexShelf, indexedPages) except KeyboardInterrupt, keyInt: print 'Ctrl-C! Stopping...', keyInt elif opt == 'q': pass else: print '\n\nUnknown option. Try again! \n\n' indexShelf.close() main()