# This is Poogle, a simple web crawler/indexer used while
# teaching CS112.
#
# This code was created by Dan Fleck
# at George Mason University
#

import urllib, sgmllib, shelve
from WebParser import MyParser

def indexer(url, words):
    '''Stores the url and associated words into the shelf
    (that is presumed open already).
    NOTE: currently a placeholder; crawl() writes to the shelf directly.'''

def search(term, indexShelf):
    '''Print a list of URLs that have the specified search term in them'''
    # Look at the shelf, go through every URL on the shelf and
    # see if the search term is stored with that URL
    term = term.lower()
    foundUrls = set()   # Set of URLs that contain the term

    for key in indexShelf.keys():
        words = indexShelf[key]

        # Strip a trailing slash so equivalent URLs collapse to one key
        if key.endswith('/'):
            key = key[:-1]

        for word in words:
            lowerCaseWord = word.lower()
            # find() returns -1 when the term is absent, so test >= 0
            # (a plain "> 0" would miss matches at the start of a word)
            if lowerCaseWord.find(term) >= 0:
                foundUrls.add(key)

    # Print out the URLs with the term
    for url in foundUrls:
        print 'Found in :', url

def crawl(url, depth, indexShelf, indexedPages):
    '''Crawls a URL, indexing each page if it hasn't already been indexed'''
    print 'Crawling : ', url

    # Skip pages that have already been indexed
    if url in indexedPages:
        return

    # Add to the set of pages already indexed
    indexedPages.add(url)

    # Read the web page source
    f = urllib.urlopen(url)
    s = f.read()
    f.close()

    # Try to parse the page; give up on this page if the HTML is malformed
    myparser = MyParser()
    try:
        myparser.parse(s)
    except sgmllib.SGMLParseError, details:
        print 'Could not parse!', details
        return

    # Collect the hyperlinks and the words found on the page
    hyperlinks = set(myparser.get_hyperlinks())
    words = set(myparser.get_descriptions())

    # Index the page: map the URL to its set of words
    indexShelf[url] = words

    # Crawl each individual hyperlink until we run out of depth
    if depth > 0:
        for link in hyperlinks:
            try:
                crawl(link, depth - 1, indexShelf, indexedPages)
            except IOError, details:
                print 'ERROR:', details
                print 'Still going....'

# The main program
def main():
    indexShelf = shelve.open('data1', writeback=True)   # Opens my shelf
    indexedPages = set()

    opt = 'x'   # Initialize it to something so we enter the while loop
    while opt != 'q':
        print '''
        *********************************************************************
        Welcome to Poogle! A stupid implementation of a Python Search Engine
        *********************************************************************

        Main Menu
        (s)earch
        (c)rawl
        (q)uit
        '''
        opt = raw_input('What would you like to do? ')

        if opt == 's':
            term = raw_input('What are you looking for? ')
            search(term, indexShelf)
        elif opt == 'c':
            try:
                crawl('http://cs.gmu.edu/faculty/', 2, indexShelf, indexedPages)
            except KeyboardInterrupt, keyInt:
                print 'Ctrl-C! Stopping...', keyInt
        elif opt == 'q':
            pass
        else:
            print '\n\nUnknown option. Try again! \n\n'

    indexShelf.close()

main()
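
# ---------------------------------------------------------------------
# NOTE: WebParser.py is not included in this file. Below is a minimal
# sketch of the MyParser interface this script expects, assuming it
# wraps sgmllib.SGMLParser. The method names (parse, get_hyperlinks,
# get_descriptions) come from the calls above; the bodies shown here are
# an assumption for illustration, not the original WebParser code.
#
# import sgmllib
#
# class MyParser(sgmllib.SGMLParser):
#     def __init__(self):
#         sgmllib.SGMLParser.__init__(self)
#         self.hyperlinks = []     # href values from <a> tags
#         self.descriptions = []   # text chunks found in the page
#
#     def parse(self, s):
#         '''Feed the page source to the parser.'''
#         self.feed(s)
#         self.close()
#
#     def start_a(self, attributes):
#         '''Collect the href of each anchor tag.'''
#         for name, value in attributes:
#             if name == 'href':
#                 self.hyperlinks.append(value)
#
#     def handle_data(self, data):
#         '''Collect the visible text of the page.'''
#         self.descriptions.append(data)
#
#     def get_hyperlinks(self):
#         return self.hyperlinks
#
#     def get_descriptions(self):
#         return self.descriptions
# ---------------------------------------------------------------------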