searchurls.py (minus the "search" part)

# -*- coding: utf-8 -*-
"""
Created on Tue Mar  9 09:57:10 2021

@author: rmontant
"""
import os
import sys
import urllib.request
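
# File format (inferred from the code below; the original does not state
# it): each specification file's first line is either the word SEARCH
# (marking the search-terms file) or the name of a results file to write
# into; every following non-blank line is a search term or a URL,
# respectively.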

def read_searchterms(l):
    """Return the non-blank lines that follow the SEARCH header line l[0]."""
    termlist = [line for line in l[1:] if len(line) > 0]
    return termlist
#--------

def process_urllist(l):
    """Fetch each URL in l[1:] and write the decoded pages into l[0]."""
    resultsfile = l[0]
    with open(resultsfile, 'w', encoding='utf-8') as h:

        for url in l[1:]:
            if len(url) == 0:   # blank line?
                continue
            try:
                rh = urllib.request.urlopen(url)
            except Exception as e:
                print(url, 'cannot urlopen()\n', e)
                continue

            contents = rh.read()
            rh.close()
            # Try the strictest codecs first; 'latin-1' goes last because
            # it maps every byte, so it always succeeds as a fallback.
            for decoder in ['ascii', 'utf-8', 'cp1252', 'latin-1']:
                try:
                    decoded = contents.decode(decoder)
                    print(url, 'used', decoder)
                except UnicodeDecodeError:
                    pass    # wrong guess; try the next codec
                else:
                    print(url, file=h)
                    print(decoded, file=h)
                    print(file=h)   # blank separator line between pages
                    break
#--------
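
# A minimal sketch of the still-to-do search step noted in main() below.
# Nothing here comes from the original file: the name search_contents()
# and its signature are hypothetical, and a plain case-insensitive
# substring test stands in for whatever matching the assignment really
# wants.
def search_contents(contents, search_terms):
    """Return the subset of search_terms that occur in the contents string."""
    lowered = contents.lower()
    return [term for term in search_terms if term.lower() in lowered]
#--------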

def main(argv=[__name__]):
    print("you are in {}".format(os.getcwd()))

  # Get the specifications_directory:
    if len(argv) >= 2:
        spec_dir = argv[1]
    else:
        spec_dir = input('What directory? ')
        if len(spec_dir) == 0:
            return 0    # abandon the program

  # Get a list of the specification_files that
  # are in the specification_directory:
    try:
        spec_files = os.listdir(spec_dir)
    except OSError as e:
        print('Bad directory name:', e)
        return 1

  # For each specification_file, read out a list of the
  # URLs that are in the file, and save that list in a
  # dictionary keyed by the name of the specification_file.
  #
  # One of the files is actually a file of search-terms,
  # so read a list of search_terms from that file.
    all_urls = {}
    search_terms = []
    for urlfile in spec_files:
        filepath = os.path.join(spec_dir, urlfile)
        if not os.path.isfile(filepath):    # skip subdirectories etc.
            continue
        print('opening {:s}'.format(filepath))
        with open(filepath, 'r', encoding='utf-8') as h:
            lines = [line.strip() for line in h]

        if len(lines) == 0:     # skip empty files
            continue
        if 'SEARCH' in lines[0]:
            search_terms += read_searchterms(lines) # Get search_terms
        else:
            all_urls[urlfile] = lines   # get this file's list of URLs

    #print(all_urls)     # check that the URLs dictionary is correct
    print(search_terms) # check that the search_terms list is correct
    print()

  # The process_urllist() function looks kind of like assignment "asn2".
    for key, value in all_urls.items():
        print(key)
        process_urllist(value)  # magically process the URLs

  # Still to do:
  #     Search through the contents of each URL for each of the terms
  #     in the search_terms list (see the search_contents() sketch above).

    print('main() is done.')
#--------

if __name__ == '__main__':
    sys.exit( main(sys.argv) )
#--------
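
# Example run, assuming this file is saved as searchurls.py (the name in
# the title above; an assumption, since this listing drops the "search"
# part):
#   python searchurls.py path/to/spec_dir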