import sys # Importing the sys module for system-specific parameters and functions
from threading import Thread # Importing the Thread class from the threading module
import time # Importing the time module for time-related functions
import urllib.request # Importing the urllib.request module for making HTTP requests
from urllib.parse import urlparse # Importing the urlparse function from urllib.parse module for URL parsing
import os # Importing the os module for operating system-related functions
import re # Importing the re module for regular expressions
class get_webpage:
    """Fetches web pages with a custom User-Agent and saves them to disk."""

    def __init__(self, user_agent):
        """Store the user-agent string and build the request header."""
        self.agent = user_agent
        self.construct_header()

    def construct_header(self):
        """Build the HTTP header dict carrying the configured User-Agent."""
        self.header = {}
        self.header['User-Agent'] = self.agent
        print("\nUser-agent used: " + str(self.header) + "\n")

    def make_request(self, webpage):
        """Download *webpage* and write its decoded HTML to a local file.

        The output filename is derived from the URL's network location
        via filename_to_write().
        """
        req = urllib.request.Request(webpage, headers=self.header)
        # Close the response explicitly instead of leaking the socket
        # (the original never closed it).
        with urllib.request.urlopen(req) as resp:
            html = resp.read()
        filename = self.filename_to_write(webpage)
        # 'with' guarantees the file is flushed and closed even on error;
        # the original open(...).write(...) leaked the file handle.
        with open(filename, "w") as out:
            out.write(html.decode())
        # os.path.join is portable; the original hard-coded a Windows '\\'
        # separator, producing wrong paths on POSIX systems.
        print("Webpage saved to: " + os.path.join(os.getcwd(), filename))

    def filename_to_write(self, webpage):
        """Return a safe filename derived from the URL's netloc.

        E.g. 'http://example.com/x' -> 'examplecom.txt'.
        """
        filename = urlparse(webpage).netloc  # same field as index [1], but named
        # Strip every non-alphanumeric character so the result is filesystem-safe.
        filename = re.sub(r'[^a-zA-Z0-9]', '', filename)
        return filename + ".txt"
class load_site_list():
    """Loads a list of website URLs from a user-specified text file."""

    def __init__(self):
        # Populated by get_website_list(); None until that is called.
        self.websitelist = None

    def get_website_list(self):
        """Prompt for a filename and load its lines into self.websitelist.

        Blank and whitespace-only lines are skipped so downstream code
        never tries to request an empty URL.  If the file is missing, an
        error is reported and the list is set to [] instead of being left
        as None (which made callers crash on len()).
        """
        file = input("Enter filename of website addresses: ")
        try:
            # 'with' closes the handle; the original leaked it.
            with open(file, "r") as fh:
                self.websitelist = [line.strip() for line in fh if line.strip()]
        except FileNotFoundError:
            print("*** file not found ***")
            self.websitelist = []  # safe empty default for callers
def main():
    """Load a list of URLs and download each one on its own thread.

    Prompts the user for the list file, then spawns one Thread per URL.
    All threads are started first and joined afterwards so the downloads
    actually overlap.
    """
    f = load_site_list()
    f.get_website_list()
    req = get_webpage("Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:48.0) Gecko/20100101 Firefox/48.0")
    threads = []
    for site in f.websitelist:  # iterate directly; no need for range(len(...))
        print("Starting thread to download: " + site)
        try:
            t = Thread(target=req.make_request, args=(site,))
            t.start()
            # BUG FIX: the original called t.join() right after t.start(),
            # which blocked until each download finished and serialized the
            # whole run. Collect the threads and join them all at the end.
            threads.append(t)
        except Exception as err:
            print("Error creating threads: " + str(err))
    for t in threads:
        t.join()  # wait for every in-flight download to complete
if __name__ == '__main__':
    # time.perf_counter() is the recommended monotonic clock for measuring
    # elapsed time; time.time() is wall-clock and can jump if the system
    # clock is adjusted mid-run. (Dead commented-out code removed.)
    start = time.perf_counter()
    main()
    end = time.perf_counter()
    print("Time taken: {:.6f} seconds".format(end - start))