CompSci 215, Advanced Python

webscraper — partial code (no regular expressions)

scraper.py — main program

#!/usr/bin/env python
# web-scraping project, first part
# 2020-02-18
import urllib.request
import ssl
import geturls

def get_url_content(url):
    response = urllib.request.urlopen(
        url,
        context=ssl._create_unverified_context()    # skip certificate verification
    )
    pageb = response.read()     # get entire page at once, as bytes
    # note: Python has no codec named 'ebcdic' (the EBCDIC codecs are cp037, cp500, ...);
    # try the strict encodings first -- latin-1 accepts every byte value, so it must
    # come last or the encodings after it would never be tried
    encodings = ['ascii', 'utf-8', 'utf-16', 'latin-1']
    for code in encodings:
        try:
            page = pageb.decode( code )
        except UnicodeDecodeError:
            print(code, "didn't work on", url)
        else:
            print(code, 'succeeded on', url)
            return page
    return None
#--------


def read_url(url):
    try:
        contents = get_url_content(url)
    except Exception as e:
        print(url, "didn't open.")
        print('Exception:', e)
        return None
    else:
        return contents
#--------


def main(argv=[__name__]):
    if len(argv) != 3:
        print('usage: {}  <requests-directory>  <terms-file>'.format(argv[0]))
        return 1

    urllist = geturls.get_URLs(argv[1])
    searchterms = argv[2]

    for urltuple in urllist:
        results = urltuple[0]       # output file named on the spec file's first line
        url = urltuple[1]
        with open(results, 'a') as hr:      # open (and close) the results file for each URL
            print(url, file=hr)
            contents = read_url(url)
            #print(
            #    'url length is {} characters.\n'.format( len(contents) ),
            #    file=hr
            #)
    return 0

#--------

if __name__ == '__main__':
    import sys
    sys.exit( main(sys.argv) )
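
main() expects geturls.get_URLs() to hand back one (results-file, url) pair per target URL. As a rough illustration (the file name and URLs here are invented, not part of the project), the returned list might look like:

[('results.txt', 'https://www.example.com/'),
 ('results.txt', 'https://www.example.org/')]

The first element of each pair names the file the scraper appends its output to; the second is the page to fetch.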

geturls.py — helper module

#!/usr/bin/env python
#       function:
#               - receives specification-directory
#               - finds all files in specification directory
#               - opens and reads each file to collect urls
#               - returns list of target urls
import os

def get_URLs(spec_path):
    files = os.listdir( spec_path )
    print(files)

    urls = []
    for f in files:     # for each file in "spec_path"
        fullpath = os.path.join(spec_path, f)
        print(fullpath)
        with open(fullpath, 'r') as h:
            results = h.readline().strip()      # the first line names the results file
            for line in h:                      # remaining lines are the target URLs
                line = line.strip()
                if len(line) == 0:              # skip blank lines
                    continue
                urls.append( (results, line) )
    return urls
#--------

def main(argv=[__name__]):
    if len(argv) != 2:
        print('usage: {}  <requests-directory>'.format(argv[0]))
        return 1
    print('{} starts with {}'.format(argv[0], argv[-1]))
    urllist = get_URLs( argv[1] )
    for url in urllist:
        print(url)
    return 0
#--------

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))
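
The comment block at the top of geturls.py describes the layout each specification file is assumed to follow: the first line names the results file, and every later non-blank line is a URL to fetch. A hypothetical requests/news.txt (name and contents invented for illustration) would therefore look like:

results.txt
https://www.example.com/
https://www.example.org/
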
$ python scraper.py
usage: scraper.py  <requests-directory>  <terms-file>
$
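
Because geturls.py has its own main(), it can be run by itself to check a specification directory. With the hypothetical requests/news.txt shown above, the run would look roughly like this (path separators and listing order may differ by platform):

$ python geturls.py requests
geturls.py starts with requests
['news.txt']
requests/news.txt
('results.txt', 'https://www.example.com/')
('results.txt', 'https://www.example.org/')
$

The scraper itself would then be started with the same directory plus a terms file, e.g. python scraper.py requests terms.txt.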