# Source code for saveutils

"""Module that defines a class for saving webpages on disk.

Only the HTML content of the webpage is saved on disk, thus the other resources,
such as pictures, might not get rendered when viewed on a browser.

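.. _HTTP request header:
    https://developer.mozilla.org/en-US/docs/Glossary/Request_header
.. _List of all HTTP headers (Mozilla):
    https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers
.. _List of HTTP header fields (Wikipedia):
    https://en.wikipedia.org/wiki/List_of_HTTP_header_fields

"""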

import logging
import os
import sys
import time
from logging import NullHandler
# Third-party modules
import requests
# Custom modules
import pyutils.exceptions.connection as connec_exc
import pyutils.exceptions.files as files_exc
from pyutils.genutils import read_file, write_file
from pyutils.log.logging_wrapper import LoggingWrapper

# Setup logging

class SaveWebpages:
    """A class that saves webpages on disk.

    The HTML content of the webpages is saved on disk. Thus, other resources
    (such as pictures) might not get rendered when viewed on a browser.

    When retrieving webpages, a certain delay is introduced between HTTP
    requests to the server in order to reduce its workload.

    Parameters
    ----------
    overwrite_webpages : bool, optional
        Whether a webpage that is saved on disk can be overwritten (the
        default value is False which implies that a webpage already found on
        disk is loaded from there instead of being downloaded again).
    http_get_timeout : int, optional
        Timeout when a **GET** request doesn't receive any response from the
        server. After the timeout expires, the **GET** request is dropped (the
        default value is 5 seconds).
    delay_between_requests : int, optional
        Minimum number of seconds that must elapse between the start of two
        consecutive HTTP requests (the default value is 8 seconds).
    headers : dict, optional
        The information added to the **HTTP GET** request that a user's browser
        sends to a Web server containing the details of what the browser wants
        and will accept back from the server. See `HTTP request header`_ (the
        default value is defined in :attr:`~SaveWebpages.headers`).

        Its keys are the request headers' field names like `Accept`, `Cookie`,
        `User-Agent`, or `Referer` and its values are the associated request
        headers' field values. *See* `List of all HTTP headers (Mozilla)`_
        *and* `List of HTTP header fields (Wikipedia)`_.

    """

    headers = {'User-Agent': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                             "(KHTML, like Gecko) Chrome/44.0.2403.157 "
                             "Safari/537.36c",
               'Accept': "text/html,application/xhtml+xml,application/xml;q=0.9,"
                         "image/webp,*/*;q=0.8"}
    """
    The information added to the **HTTP GET** request that a user's browser
    sends to a Web server containing the details of what the browser wants and
    will accept back from the server.
    """

    def __init__(self, overwrite_webpages=False, http_get_timeout=5,
                 delay_between_requests=8, headers=headers):
        self.overwrite_webpages = overwrite_webpages
        self.http_get_timeout = http_get_timeout
        self.delay_between_requests = delay_between_requests
        # Shallow-copy the mapping so that mutating one instance's headers
        # never leaks into the class-level default (or the caller's dict).
        # ``None`` is passed through unchanged.
        self.headers = None if headers is None else dict(headers)
        self.logger = logging.getLogger(__name__)
        # Experimental option: add color to log messages
        color_logs = os.environ.get('COLOR_LOGS')
        if color_logs:
            self.logger = LoggingWrapper(self.logger, color_logs)
        # Establish a session to be used for the GET requests
        self.req_session = requests.Session()
        # Far enough in the past that the very first request is never delayed
        self.last_request_time = -sys.float_info.max

    def get_cached_webpage(self, filepath):
        """Load a webpage from disk.

        Load the HTML content of a webpage from disk. The webpages are cached
        in order to reduce the number of requests to the server.

        Parameters
        ----------
        filepath : str
            The file path of the webpage to load from disk.

        Raises
        ------
        OSError
            Raised if an I/O related error occurs while reading the cached HTML
            document, e.g. the file doesn't exist. The original exception
            (including its concrete subclass and ``errno``) is propagated
            unchanged.

        Returns
        -------
        html : str
            HTML content of the webpage that is loaded from disk.

        """
        self.logger.debug(
            "Reading the cached HTML file '{}'".format(filepath))
        # Any OSError raised here propagates as-is; re-wrapping it would
        # discard the subclass (e.g. FileNotFoundError) and the errno.
        html = read_file(filepath)
        self.logger.debug(
            "The webpage HTML was successfully loaded from '{}'".format(
                filepath))
        return html

    def save_webpage(self, filepath, url):
        """Save a webpage on disk.

        If a file already exists at `filepath` and ``overwrite_webpages`` is
        disabled, the cached HTML content is loaded from disk and returned.
        Otherwise, the webpage is retrieved from the server and written to
        `filepath`.

        **IMPORTANT:** when the option ``overwrite_webpages`` is set to True,
        the webpage is always retrieved again and the cached file is
        overwritten.

        Parameters
        ----------
        filepath : str
            File path of the webpage that will be saved on disk.
        url : str
            URL to the webpage that will be saved on disk.

        Raises
        ------
        HTTP404Error
            Raised if the server returns a 404 status code because the webpage
            is not found.
        OverwriteFileError
            Raised if an existing file is being overwritten and the flag to
            overwrite files is disabled.
        OSError
            Raised if an I/O related error occurs while reading the cached
            webpage or writing the webpage on disk.
        requests.RequestException
            Raised if there is a :mod:`requests`-related error while
            retrieving the webpage.

        Returns
        -------
        html : str
            HTML content of the webpage that is saved on disk.

        """
        if os.path.isfile(filepath) and not self.overwrite_webpages:
            return self.get_cached_webpage(filepath)
        # Retrieve webpage
        html = self.get_webpage(url)
        self.logger.debug("Webpage retrieved!")
        # Write webpage locally
        self.logger.debug("Saving webpage to '{}'".format(filepath))
        write_file(filepath, html, self.overwrite_webpages)
        self.logger.debug("The webpage is saved in '{}'. URL is "
                          "'{}'".format(filepath, url))
        return html

    def get_webpage(self, url):
        """Get the HTML content of a webpage.

        When retrieving the webpage, a certain delay is introduced between HTTP
        requests to the server in order to reduce its workload.

        Parameters
        ----------
        url : str
            URL of the webpage whose HTML content will be retrieved.

        Raises
        ------
        HTTP404Error
            Raised if the server returns a 404 status code because the webpage
            is not found.
        requests.RequestException
            Raised if there is a :mod:`requests`-related error, e.g.
            :exc:`requests.ConnectionError` if the URL is not known. The
            original exception is propagated unchanged, so its concrete
            subclass is preserved.

        Returns
        -------
        html : str
            HTML content of the webpage retrieved from the server. For status
            codes other than 200 and 404 the response body is still returned
            and the status code is only logged.

        """
        elapsed = time.time() - self.last_request_time
        remaining = self.delay_between_requests - elapsed
        if remaining > 0:
            self.logger.debug("Waiting {} seconds before sending next HTTP "
                              "request...".format(remaining))
            time.sleep(remaining)
            self.logger.debug("Time is up! HTTP request will be sent.")
        self.logger.debug("Sending HTTP request ...")
        # Record the start time before sending so that a failed or slow
        # request still counts towards the delay between requests.
        self.last_request_time = time.time()
        req = self.req_session.get(
            url, headers=self.headers, timeout=self.http_get_timeout)
        html = req.text
        if req.status_code == 404:
            raise connec_exc.HTTP404Error(
                "404: PAGE NOT FOUND. The URL '{}' returned a 404 status "
                "code.".format(url))
        if req.status_code == 200:
            self.logger.debug("200: OK. Webpage successfully retrieved!")
        else:
            self.logger.debug(
                "Request response: status code is {}".format(req.status_code))
        return html