# This code was downloaded from http://www.boddie.org.uk/python/HTML.html
#
# This code implements a very simple HTML parser that returns the hyperlinks on a page and their descriptions (the text inside each link)
#
import sgmllib

class MyParser(sgmllib.SGMLParser):
    "A simple parser class."

    def parse(self, s):
        "Parse the given string 's'."
        self.feed(s)
        self.close()

    def __init__(self, verbose=0):
        "Initialise an object, passing 'verbose' to the superclass."

        sgmllib.SGMLParser.__init__(self, verbose)
        self.hyperlinks = []
        self.descriptions = []
        self.inside_a_element = 0

    def start_a(self, attributes):
        "Process a hyperlink and its 'attributes'."

        for name, value in attributes:
            if name == "href":
                self.hyperlinks.append(value)
                self.inside_a_element = 1

    def end_a(self):
        "Record the end of a hyperlink."

        self.inside_a_element = 0

    def handle_data(self, data):
        "Handle the textual 'data'."

        if self.inside_a_element:
            self.descriptions.append(data)

    def get_hyperlinks(self):
        "Return the list of hyperlinks."

        return self.hyperlinks

    def get_descriptions(self):
        "Return a list of descriptions."

        return self.descriptions


# Fleck modified some of this
def main():
    """Fetch http://www.python.org and print its hyperlinks and link text.

    Performs a network request; nothing is caught here, so any error raised
    while fetching or parsing propagates to the caller.
    """

    import urllib

    # Get something to work with.  The response object is closed in a
    # 'finally' clause so the connection is released even if read() fails.
    f = urllib.urlopen("http://www.python.org")
    try:
        s = f.read()
    finally:
        f.close()

    # Try and process the page.
    # The class should have been defined first, remember.
    myparser = MyParser()
    myparser.parse(s)

    # Print the hyperlinks, then the descriptions.  A single parenthesised
    # argument prints exactly what the bare Python 2 print statement would.
    print(myparser.get_hyperlinks())
    print(myparser.get_descriptions())


# Run the demo only when this file is executed as a script; importing the
# module (e.g. to reuse MyParser) does not call main().
if __name__ == '__main__':
    main()