# searchurls.py

# -*- coding: utf-8 -*-
"""
Created on Tue Mar  9 09:57:10 2021

@author: rmontant
"""
import os, sys
import urllib.request

def read_searchterms(lst):
    """Return the search terms held in the lines of a search-term file.

    The first element of *lst* is the header line (it just says "SEARCH")
    and is never part of the result.  Every later element whose length is
    non-zero is kept, in its original order.
    """
    body = lst[1:]
    # filter(len, ...) keeps exactly the entries whose length is non-zero.
    return list(filter(len, body))
#--------

def _decode_contents(raw):
    """Decode the bytes *raw* and return ``(text, decoder_name)``.

    Decoders are tried from strictest to most permissive.  'cp1252' is
    tried before 'latin-1' because latin-1 maps every possible byte and so
    can never fail: anything listed after it would be unreachable, and
    bytes in the 0x80-0x9F range (euro sign, curly quotes, ...) would be
    turned into control characters instead of the intended punctuation.
    latin-1 is kept as the last resort so that a string is always returned.
    """
    for decoder in ('ascii', 'utf-8', 'cp1252'):
        try:
            return raw.decode(decoder), decoder
        except UnicodeDecodeError:
            continue
    return raw.decode('latin-1'), 'latin-1'


def process_urllist(lst):
    """Fetch every URL in *lst* and return ``{url: decoded_contents}``.

    lst[0] contains the name of a destination file for results; it is
    (re)created as UTF-8 text and receives, for each URL fetched, the URL,
    the decoded contents and a blank line.  The remaining elements are
    URLs; empty strings (blank lines) are skipped.

    A URL that cannot be opened or read is reported on stdout and left out
    of both the results file and the returned dictionary.  An empty *lst*
    yields an empty dictionary and no file is created.
    """
    results_dict = {}
    if not lst:
        return results_dict

    resultsfile = lst[0]
    with open(resultsfile, 'w', encoding='utf-8') as h:
        for url in lst[1:]:
            if not url:   # blank line?
                continue
            try:
                # The with-block closes the response even if read() fails.
                with urllib.request.urlopen(url) as rh:
                    raw = rh.read()
            except Exception as e:
                # Deliberately broad: one bad URL (malformed, unreachable,
                # HTTP error, ...) must not stop the remaining downloads.
                print(url, 'cannot urlopen()\n', e)
                continue

            contents, decoder = _decode_contents(raw)
            print(url, 'used', decoder)

            print(url, file=h)
            print(contents, file=h)
            print(file=h)
            results_dict[url] = contents

    return results_dict
#--------

def do_searching(rdict, searchterms):
    """Locate every occurrence of each search term in each page.

    *rdict* maps a URL to its text and *searchterms* is an iterable of
    strings.  The returned dictionary has one entry per (term, url) pair,
    whose value is the list of offsets, in ascending order, at which the
    term starts in that URL's text.  Overlapping occurrences are all
    reported; a pair with no occurrence maps to an empty list.
    """
    hits = {}

    for needle in searchterms:
        for address, text in rdict.items():
            offsets = []
            start = 0
            while True:
                index = text.find(needle, start)
                if index == -1:
                    break
                offsets.append(index)
                # Resume one character later so overlapping matches count.
                start = index + 1
            hits[(needle, address)] = offsets

    return hits
#--------

def main(argv=None):
    """Search the URLs listed in a directory of specification files.

    *argv* is an argument vector; argv[1], when present, names the
    directory to process, otherwise the user is prompted for it (an empty
    answer abandons the program).  Omitting *argv* behaves like passing
    ``[__name__]``, i.e. the user is prompted.

    Each regular file in the directory is read as stripped lines.  A file
    whose first line contains 'SEARCH' contributes search terms; any other
    file is treated as a URL list and handed to process_urllist().  Entries
    that are not regular files, and files without a first line, are
    reported and skipped.  For every URL list the positions found by
    do_searching() are printed.

    Returns 0 on success or when abandoned, 1 if the directory cannot be
    listed.
    """
    if argv is None:
        # Built per call instead of sharing one mutable default list.
        argv = [__name__]

    print("you are in {}".format(os.getcwd()))
    if len(argv) >= 2:
        spec_dir = argv[1]
    else:
        spec_dir = input('What directory? ')
        if len(spec_dir) == 0:
            return 0    # abandon the program

    try:
        spec_files = os.listdir(spec_dir)
    except (OSError, ValueError):
        print('Bad directory name')
        return 1

    all_urls = {}
    search_terms = []
    for urlfile in spec_files:
        filepath = os.path.join(spec_dir, urlfile)
        if not os.path.isfile(filepath):
            # os.listdir() also returns sub-directories; open() would fail.
            print('skipping {:s} (not a regular file)'.format(filepath))
            continue

        print('opening {:s}'.format(filepath))
        with open(filepath, 'r') as h:
            lines = [line.strip() for line in h]

        if not lines or not lines[0]:
            # Without a first line the file is neither a search-term file
            # nor a URL list naming its results file.
            print('skipping {:s} (no header line)'.format(filepath))
            continue

        if 'SEARCH' in lines[0]:
            search_terms += read_searchterms(lines)
        else:
            all_urls[urlfile] = lines

    print(search_terms)

    #
    # What follows looks like assignment "asn2"
    #
    for k, v in all_urls.items():
        print(k)
        rdict = process_urllist(v)  # fetch and decode the listed URLs

        sdict = do_searching(rdict, search_terms)
        for term_url_pair, positions in sdict.items():
            print(term_url_pair)
            print(positions)
            print()

    print('main() is done.')
    return 0
#--------

# When run as a script, call main() with the command-line arguments and use
# its return value as the process exit status.
if __name__ == '__main__':
    sys.exit( main(sys.argv) )
#--------