# This is Poogle, a simple web crawler/indexer used while
# teaching CS112.
#
# This code was created by Dan Fleck
# at George Mason University
#

import urllib, sgmllib, shelve
from WebParser import MyParser

def indexer(url, words):
    '''Stores the url and associated words into the shelf
    (that is presumed open already).
    NOTE: currently a placeholder; crawl() writes to the shelf directly.'''

def search(term, indexShelf):
    '''Print a list of URLs that have the specified search term in them'''
    # Look at the shelf, go through every URL on the shelf and
    # see if the search term is stored with that URL
    term = term.lower()
    foundUrls = set()   # Set of URLs that contain the term

    for key in indexShelf.keys():
        words = indexShelf[key]

        # Strip a trailing slash so equivalent URLs collapse to one key
        if key.endswith('/'):
            key = key[:-1]

        for word in words:
            lowerCaseWord = word.lower()
            # find() returns -1 when the term is absent, so test >= 0
            # (a plain "> 0" would miss matches at the start of a word)
            if lowerCaseWord.find(term) >= 0:
                foundUrls.add(key)

    # Print out the URLs with the term
    for url in foundUrls:
        print 'Found in :', url

def crawl(url, depth, indexShelf, indexedPages):
    '''Crawls a URL, indexing each page if it hasn't already been indexed'''
    print 'Crawling : ', url

    # Skip pages that have already been indexed
    if url in indexedPages:
        return

    # Add to the set of pages already indexed
    indexedPages.add(url)

    # Read the web page source
    f = urllib.urlopen(url)
    s = f.read()
    f.close()

    # Try to parse the page; give up on this page if the HTML is malformed
    myparser = MyParser()
    try:
        myparser.parse(s)
    except sgmllib.SGMLParseError, details:
        print 'Could not parse!', details
        return

    # Collect the hyperlinks and the words found on the page
    hyperlinks = set(myparser.get_hyperlinks())
    words = set(myparser.get_descriptions())

    # Index the page: map the URL to its set of words
    indexShelf[url] = words

    # Crawl each individual hyperlink until we run out of depth
    if depth > 0:
        for link in hyperlinks:
            try:
                crawl(link, depth - 1, indexShelf, indexedPages)
            except IOError, details:
                print 'ERROR:', details
                print 'Still going....'

# The main program
def main():
    indexShelf = shelve.open('data1', writeback=True)   # Opens my shelf
    indexedPages = set()

    opt = 'x'   # Initialize it to something so we enter the while loop
    while opt != 'q':
        print '''
        *********************************************************************
        Welcome to Poogle! A stupid implementation of a Python Search Engine
        *********************************************************************

        Main Menu
        (s)earch
        (c)rawl
        (q)uit
        '''
        opt = raw_input('What would you like to do? ')

        if opt == 's':
            term = raw_input('What are you looking for? ')
            search(term, indexShelf)
        elif opt == 'c':
            try:
                crawl('http://cs.gmu.edu/faculty/', 2, indexShelf, indexedPages)
            except KeyboardInterrupt, keyInt:
                print 'Ctrl-C! Stopping...', keyInt
        elif opt == 'q':
            pass
        else:
            print '\n\nUnknown option. Try again! \n\n'

    indexShelf.close()

main()
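
# ---------------------------------------------------------------------
# NOTE: WebParser.py is not included in this file. Below is a minimal
# sketch of the MyParser interface this script expects, assuming it
# wraps sgmllib.SGMLParser. The method names (parse, get_hyperlinks,
# get_descriptions) come from the calls above; the bodies shown here are
# an assumption for illustration, not the original WebParser code.
#
# import sgmllib
#
# class MyParser(sgmllib.SGMLParser):
#     def __init__(self):
#         sgmllib.SGMLParser.__init__(self)
#         self.hyperlinks = []     # href values from <a> tags
#         self.descriptions = []   # text chunks found in the page
#
#     def parse(self, s):
#         '''Feed the page source to the parser.'''
#         self.feed(s)
#         self.close()
#
#     def start_a(self, attributes):
#         '''Collect the href of each anchor tag.'''
#         for name, value in attributes:
#             if name == 'href':
#                 self.hyperlinks.append(value)
#
#     def handle_data(self, data):
#         '''Collect the visible text of the page.'''
#         self.descriptions.append(data)
#
#     def get_hyperlinks(self):
#         return self.hyperlinks
#
#     def get_descriptions(self):
#         return self.descriptions
# ---------------------------------------------------------------------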