From Website to PDF using Vivaldi Browser and SingleFile extension

August 31, 2024 4 minutes read

web scraping • PDF conversion • SingleFile • Python • pandoc

Pre-Requisites:

Vivaldi Browser (although any other browser can be used)
SingleFile extension

If you are using Chrome, single-file-cli should also work, but it didn’t work with Vivaldi.

Start with a collection of links in a file

link1.html
link2.html

Download the complete HTML for each URL

Note: Change the x, y screen coordinates in process_url if the SingleFile button is at a different position on your screen.

GitHub

#!/usr/bin/env python3
"""
A script to download web pages using Vivaldi browser and SingleFile extension

Usage:
./download-urls.py -h

./download-urls.py -v file.txt # To log INFO messages
./download-urls.py -vv file.txt # To log DEBUG messages
"""
import logging
import subprocess
import time
from argparse import ArgumentParser, RawDescriptionHelpFormatter

import pyautogui


def setup_logging(verbosity):
    """Configure root logging according to the ``-v`` count.

    A verbosity of exactly 1 selects INFO, 2 or more selects DEBUG, and
    any other value leaves the level at WARNING. Records go to a stream
    handler, and warnings from the ``warnings`` module are routed through
    logging as well.
    """
    if verbosity >= 2:
        chosen_level = logging.DEBUG
    elif verbosity == 1:
        chosen_level = logging.INFO
    else:
        chosen_level = logging.WARNING

    logging.basicConfig(
        level=chosen_level,
        format="%(asctime)s - %(filename)s:%(lineno)d - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        handlers=[logging.StreamHandler()],
    )
    logging.captureWarnings(True)


def parse_args():
    """Parse the command line and return the resulting namespace.

    The namespace carries ``file`` (path of the URL list) and ``verbose``
    (how many times ``-v`` was given, 0 when absent). The module docstring
    is shown verbatim as the help description.
    """
    cli = ArgumentParser(
        formatter_class=RawDescriptionHelpFormatter,
        description=__doc__,
    )
    cli.add_argument("file", help="File containing list of URLs")
    cli.add_argument(
        "-v",
        "--verbose",
        dest="verbose",
        action="count",
        default=0,
        help="Increase verbosity of logging output",
    )
    return cli.parse_args()


def open_vivaldi(url):
    """Ask the ``open`` launcher to load *url* in the Vivaldi application.

    The launcher is run to completion so the child process is reaped
    rather than left as an un-waited ``Popen``. A non-zero exit status is
    logged as a warning and does not raise, so one bad URL does not stop
    the run.

    Args:
        url: Address passed to ``open -a Vivaldi``.
    """
    # NOTE(review): assumes `open` returns promptly once the URL has been
    # handed to the application (no -W flag is passed) - confirm on target OS.
    result = subprocess.run(["open", "-a", "Vivaldi", url], check=False)
    if result.returncode != 0:
        logging.warning(
            "'open' exited with status %d for URL: %s", result.returncode, url
        )


def process_url(url):
    """Open *url* in Vivaldi and drive the pointer through a fixed sequence.

    Steps: open the URL, sleep 5 seconds, move to the first coordinate
    pair and click, move to the second coordinate pair, sleep 15 seconds.
    The coordinates are hard-coded screen positions.
    """
    # Screen positions used below; adjust them for your own display layout.
    icon_x, icon_y = 1493, 92
    target_x, target_y = 1489, 287

    logging.info(f"Opening Vivaldi with URL: {url}")
    open_vivaldi(url)
    time.sleep(5)  # give the page time to load

    logging.debug(f"Moving to coordinates: ({icon_x}, {icon_y})")
    pyautogui.moveTo(icon_x, icon_y)
    logging.debug("Clicking at current position")
    pyautogui.click()

    # NOTE(review): the pointer is moved to the second position but no click
    # is issued there - confirm this is intended.
    logging.debug(f"Moving to coordinates: ({target_x}, {target_y})")
    pyautogui.moveTo(target_x, target_y)

    logging.info("Waiting for 15 seconds")
    time.sleep(15)


def main(args):
    """Process every URL listed in ``args.file``, one after another.

    The file is read as UTF-8 with one URL per line. Surrounding
    whitespace is stripped and blank lines are skipped, so an empty line
    (for example a trailing newline at the end of the file) is never
    handed to the browser as an empty URL.

    Args:
        args: Parsed command-line namespace; only ``args.file`` is read.

    ``KeyboardInterrupt`` and any other exception raised while reading the
    file or processing a URL are logged and swallowed; nothing is re-raised.
    """
    # Ensure a safe exit is possible
    pyautogui.FAILSAFE = True

    # Initial delay before starting the process
    logging.info("Starting initial delay of 5 seconds")
    time.sleep(5)

    try:
        with open(args.file, encoding="utf-8") as f:
            stripped_lines = (line.strip() for line in f)
            urls = [candidate for candidate in stripped_lines if candidate]

        logging.info(f"Found {len(urls)} URLs in the file")

        for i, url in enumerate(urls, 1):
            logging.info(f"Processing URL {i} of {len(urls)}: {url}")
            process_url(url)

        logging.info("Script completed successfully.")

    except KeyboardInterrupt:
        logging.warning("Script terminated by user.")
    except Exception as e:
        # Top-level boundary: report the problem instead of dumping a traceback.
        logging.error(f"An error occurred: {e}")


if __name__ == "__main__":
    # Parse the command line first: the verbosity count it yields is what
    # configures logging, and logging must be set up before main() logs.
    args = parse_args()
    setup_logging(args.verbose)
    main(args)

Clean up any unwanted tags

import os
import re

def clean_html_files(directory):
    """Strip iframes and ``dir=ltr`` attributes from HTML files in *directory*.

    Only regular files directly inside *directory* whose names end in
    ``.html`` are considered (no recursion). A file is rewritten in place
    as UTF-8, and its name printed, only when its content actually changes.

    Args:
        directory: Path of the folder to scan.
    """
    # Non-greedy with DOTALL so each iframe, including multi-line ones,
    # is removed on its own.
    iframe_pattern = re.compile(r'<iframe.*?</iframe>', re.DOTALL)

    # Consume only the whitespace *before* the attribute and require a
    # delimiter after it. Eating whitespace on both sides would glue the
    # neighbours together: ``<html dir=ltr lang=en>`` -> ``<htmllang=en>``.
    # The optional quote group also covers dir="ltr" and dir='ltr'.
    dir_ltr_pattern = re.compile(r'''\s+dir=(["']?)ltr\1(?=[\s/>])''')

    for filename in os.listdir(directory):
        if not filename.endswith('.html'):
            continue

        filepath = os.path.join(directory, filename)
        if not os.path.isfile(filepath):
            # A directory that happens to be named "*.html" cannot be opened.
            continue

        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()

        # Remove iframes and dir=ltr
        modified_content = iframe_pattern.sub('', content)
        modified_content = dir_ltr_pattern.sub('', modified_content)

        if content != modified_content:
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(modified_content)
            print(f"Processed: {filename}")

# Use the current directory: clean the .html files found in the
# working directory the script is run from.
current_directory = os.getcwd()
clean_html_files(current_directory)

Convert each HTML file to PDF

NOTE: Update the html_dir variable to where the files are downloaded.

#!/bin/bash
#
# Convert every *.html file in $html_dir to a PDF with the same base name
# inside $html_dir/pdf, using pandoc.
# Exits 1 when there is nothing to convert or when any conversion fails.

# Set the directory containing the HTML files
html_dir="$HOME/temp"
pdf_dir="$html_dir/pdf"

# Create the pdf subdirectory if it doesn't exist
mkdir -p "$pdf_dir" || exit 1

# Collect the HTML files up front. With nullglob an unmatched pattern
# expands to nothing instead of the literal "*.html" string.
shopt -s nullglob
html_files=("$html_dir"/*.html)
shopt -u nullglob

if [ "${#html_files[@]}" -eq 0 ]; then
  echo "No HTML files found in $html_dir"
  exit 1
fi

failed=0

# Iterate over each HTML file in the directory
for html_file in "${html_files[@]}"; do
  # Extract the filename without the extension
  filename=$(basename "$html_file" .html)
  printf "Converting %s to %s.pdf\n" "$html_file" "$filename"

  # Convert the HTML file to PDF using pandoc; only report success when
  # pandoc actually succeeded, and keep going with the remaining files.
  if pandoc "$html_file" -o "$pdf_dir/$filename.pdf"; then
    echo "Converted $html_file to $pdf_dir/$filename.pdf"
  else
    echo "Failed to convert $html_file" >&2
    failed=$((failed + 1))
  fi
done

if [ "$failed" -gt 0 ]; then
  echo "$failed file(s) could not be converted; the others are in $pdf_dir" >&2
  exit 1
fi

echo "All PDF files have been saved in $pdf_dir"

Merge into one PDF

pip install PyPDF2

import os
from PyPDF2 import PdfMerger

def merge_pdfs(input_directory, output_filename):
    """Merge the PDF files in *input_directory* into *output_filename*.

    Files are picked by a case-insensitive ``.pdf`` extension and appended
    in sorted filename order. If *output_filename* itself lives inside
    *input_directory* it is left out of the inputs, so a previous result
    is never merged into itself. When no input PDFs are found nothing is
    written and a message is printed instead.

    Args:
        input_directory: Folder containing the PDFs to merge (not recursive).
        output_filename: Path of the merged PDF to create or overwrite.
    """
    output_path = os.path.abspath(output_filename)

    # Sorted so the page order is consistent from run to run.
    pdf_files = sorted(
        name
        for name in os.listdir(input_directory)
        if name.lower().endswith(".pdf")
        and os.path.abspath(os.path.join(input_directory, name)) != output_path
    )

    if not pdf_files:
        print(f"No PDF files found in {input_directory}; nothing to merge")
        return

    merger = PdfMerger()
    try:
        # Append each PDF file to the merger
        for filename in pdf_files:
            merger.append(os.path.join(input_directory, filename))

        # Write the merged PDF to the output file
        with open(output_filename, "wb") as output_file:
            merger.write(output_file)
    finally:
        # Release whatever the merger still holds for its inputs, even on error.
        merger.close()

    print(f"Merged {len(pdf_files)} PDF files into {output_filename}")

# Set up the paths
current_directory = os.getcwd()
output_file = os.path.join(current_directory, "merged_output.pdf")

# "pdf-output" remains the preferred input folder. "pdf" is accepted as a
# fallback because that is the folder name the pandoc conversion step
# writes to, so the two steps work together without renaming anything.
input_directory = os.path.join(current_directory, "pdf-output")
if not os.path.isdir(input_directory):
    fallback_directory = os.path.join(current_directory, "pdf")
    if os.path.isdir(fallback_directory):
        input_directory = fallback_directory

# Check that an input directory was found
if not os.path.isdir(input_directory):
    print(f"Error: The directory '{input_directory}' does not exist.")
else:
    merge_pdfs(input_directory, output_file)

The final copy will be in the merged_output.pdf file in the current directory.