# searchurls.py (minus the "search" part)
"""
Created on Tue Mar 9 09:57:10 2021
@author: rmontant
"""
import os, sys
import urllib.request
def read_searchterms(l):
    """Return the non-empty entries of *l* that follow its first element.

    The first element (the header line of a search-terms file) is always
    dropped; every later entry whose length is zero is discarded.
    """
    kept = []
    for entry in l[1:]:
        if len(entry) == 0:
            continue
        kept.append(entry)
    return kept
def process_urllist(l):
    """Fetch every URL listed in *l* and write the decoded pages to a file.

    Args:
        l: A list of strings. ``l[0]`` is the path of the results file,
            which is opened for writing as UTF-8 (truncating any existing
            file). Each remaining non-empty entry is treated as a URL.

    For each URL that can be opened, the raw bytes are decoded with the
    first codec in the candidate list that succeeds; the URL and the
    decoded text are then written to the results file. URLs that cannot
    be opened are reported on stdout and skipped.
    """
    resultsfile = l[0]
    with open(resultsfile, 'w', encoding='utf-8') as h:
        for url in l[1:]:
            if not url:
                continue
            try:
                rh = urllib.request.urlopen(url)
            except Exception as e:
                print(url, 'cannot urlopen()\n', e)
                continue
            # Close the response as soon as the body has been read so that
            # a long URL list does not accumulate open connections.
            with rh:
                contents = rh.read()
            # NOTE: 'latin-1' maps every possible byte value, so it never
            # fails and 'cp1252' is effectively never reached. The order is
            # kept as-is to preserve the existing output.
            for decoder in ['ascii', 'utf-8', 'latin-1', 'cp1252']:
                try:
                    text = contents.decode(decoder)
                except UnicodeDecodeError:
                    # Wrong codec for these bytes; try the next candidate.
                    continue
                print(url, 'used', decoder)
                print(url, file=h)
                print(text, file=h)
                print()
                break
def main(argv=None):
    """Read the spec files in a directory and fetch the URLs they list.

    Args:
        argv: Command-line style argument list. ``argv[1]``, when present,
            is the directory holding the spec files; otherwise the user is
            prompted for it. Defaults to ``[__name__]``.

    Each regular file in the directory is read as stripped lines. A file
    whose first line contains ``SEARCH`` contributes search terms (via
    ``read_searchterms``); any other file is treated as a URL list and is
    handed to ``process_urllist``. Sub-directories, empty files and files
    with a blank first line are reported and skipped.

    Returns:
        0 on success or when no directory was given, 1 when the directory
        cannot be listed.
    """
    # None sentinel instead of a mutable list default; same effective value.
    if argv is None:
        argv = [__name__]
    print("you are in {}".format(os.getcwd()))
    if len(argv) >= 2:
        spec_dir = argv[1]
    else:
        spec_dir = input('What directory? ')
    if not spec_dir:
        return 0
    try:
        spec_files = os.listdir(spec_dir)
    except (OSError, ValueError):
        # OSError: missing/unreadable/not a directory.
        # ValueError: malformed path such as an embedded null byte.
        print('Bad directory name')
        return 1
    all_urls = {}
    search_terms = []
    for urlfile in spec_files:
        filepath = os.path.join(spec_dir, urlfile)
        if not os.path.isfile(filepath):
            # Directory entries that are not regular files cannot be read.
            print('skipping {:s} (not a regular file)'.format(filepath))
            continue
        print('opening {:s}'.format(filepath))
        with open(filepath, 'r') as h:
            lines = [line.strip() for line in h]
        if not lines or not lines[0]:
            # Without a first line there is neither a SEARCH marker nor a
            # results-file name, so the file cannot be processed.
            print('skipping {:s} (no header line)'.format(filepath))
            continue
        if 'SEARCH' in lines[0]:
            search_terms += read_searchterms(lines)
        else:
            all_urls[urlfile] = lines
    print(search_terms)
    print()
    for key, value in all_urls.items():
        print(key)
        process_urllist(value)
    print('main() is done.')
    return 0
# Script entry point: run main() with the process arguments and use its
# return value as the exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv)
    sys.exit(exit_status)