# The geturls() script, cleaned up.
"""
Created on Tue Mar 9 09:57:10 2021
update 2021-03-11
@author: rmontant
"""
import os, sys
import urllib.request
def process_urllist(l):
    """Download a list of URLs and save their decoded text to one file.

    Parameters
    ----------
    l : list of str
        ``l[0]`` is the path of the results file, which is created or
        overwritten.  Every following item is a URL to fetch; empty
        strings are skipped.  An empty list is a no-op.

    For each URL that can be fetched and decoded, the results file receives
    the URL on one line, the decoded contents, and a blank line.  URLs that
    cannot be opened or read are reported on stdout and skipped, so a single
    bad URL does not abort the rest of the list.
    """
    if not l:
        # Nothing to do: there is not even a results file name.
        return
    resultsfile = l[0]
    # Candidate encodings, tried in order.  'latin-1' maps every possible
    # byte and therefore never fails, so it has to come last; 'cp1252' is
    # tried before it, otherwise it could never be selected.
    decoders = ('ascii', 'utf-8', 'cp1252', 'latin-1')
    with open(resultsfile, 'w', encoding='utf-8') as h:
        for url in l[1:]:
            if not url:
                continue
            try:
                # The context manager closes the response once it is read.
                with urllib.request.urlopen(url) as rh:
                    raw = rh.read()
            except Exception as e:
                # Deliberately broad: this is a best-effort batch download,
                # so any failure for one URL is reported and skipped.
                print(url, 'failed to open:\n', e)
                continue
            for decoder in decoders:
                try:
                    contents = raw.decode(decoder)
                except UnicodeDecodeError:
                    continue
                print(url, 'used', decoder)
                print(url, file=h)
                print(contents, file=h)
                print(file=h)
                # Stop at the first encoding that works.
                break
def main(argv=None):
    """Fetch the URLs listed in every spec file of a directory.

    Parameters
    ----------
    argv : list of str, optional
        Command-line style arguments.  ``argv[1]``, when present, is the
        directory holding the spec files; otherwise the user is prompted
        for it.  Defaults to ``[__name__]``.

    Each regular file in the directory is read as a spec file: its lines
    are stripped and the resulting list is handed to ``process_urllist``,
    which uses the first line as the results file name and the rest as
    URLs.  Sub-directories, unreadable files, and spec files whose first
    line is missing or blank are skipped.

    Returns
    -------
    int
        0 on success or when no directory was given, 1 when the directory
        cannot be listed.
    """
    if argv is None:
        # Built per call rather than shared as a mutable default argument.
        argv = [__name__]
    print("you are in{}".format(os.getcwd()))
    if len(argv) >= 2:
        spec_dir = argv[1]
    else:
        spec_dir = input('What directory? ')
    if not spec_dir:
        return 0
    try:
        spec_files = os.listdir(spec_dir)
    except (OSError, ValueError):
        # OSError: missing, not a directory, or not permitted.
        # ValueError: the name contains an embedded null byte.
        print('Bad directory name')
        return 1
    all_urls = {}
    for urlfile in spec_files:
        filepath = os.path.join(spec_dir, urlfile)
        if not os.path.isfile(filepath):
            # Sub-directories and other non-files cannot be spec files.
            continue
        print('opening {:s}'.format(filepath))
        try:
            with open(filepath, 'r') as h:
                urls = [line.strip() for line in h]
        except (OSError, UnicodeDecodeError) as e:
            print('skipping {:s}:'.format(filepath), e)
            continue
        if not urls or not urls[0]:
            # Without a first line there is no results file to write to.
            print('skipping {:s}: no results file name'.format(filepath))
            continue
        all_urls[urlfile] = urls
    # All spec files are read before any downloading starts.
    for urls in all_urls.values():
        process_urllist(urls)
    print('main() is done.')
    return 0
# Run main() only when this file is executed as a script; the value it
# returns becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main(sys.argv))