"""
|
|
title: Scrape Recursively
|
|
author: Mark Bailey
|
|
author_url: <url>
|
|
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
|
description: Scrapes a website recursively using requests and BeautifulSoup.
|
|
version: 0.1.0
|
|
licence: MIT
|
|
"""

import requests
from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup
from pydantic import BaseModel, ConfigDict


class Tools(BaseModel):
    # Pydantic deep-copies mutable defaults per instance, so the empty
    # list/dict defaults below are safe to share in the class body.
    citation: bool = True
    visited: list[str] = []
    netloc: str | None = None
    scheme: str | None = None
    data: dict = {}
    req_limit: int = 5

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, req_limit: int = 5):
        super().__init__(req_limit=req_limit)

    def scrape_website(
        self,
        url: str,
    ) -> dict:
        """
        Scrapes data from a web page using requests and BeautifulSoup.

        :param url: The URL of the web page to be scraped. Required.
        """

        # Remember the host and scheme of the first URL we see; clean_link
        # uses them to skip links that leave this site.
        if self.netloc is None:
            self.netloc = parse_url(url).netloc
            self.scheme = parse_url(url).scheme

        if self.req_limit <= 0:
            return self.data

        try:
            # Clean the URL
            cleaned_url = self.clean_link(url)
            if cleaned_url is None:
                raise Exception("Invalid URL: " + url)

            if cleaned_url in self.visited:
                return {}

            # Send GET request (with a timeout so a dead server cannot hang
            # the whole crawl)
            response = requests.get(cleaned_url, timeout=10)
            if not response.ok:
                self.visited.append(cleaned_url)
                raise Exception("Failed to fetch URL: " + cleaned_url)

            # Parse HTML content using BeautifulSoup and the lxml parser
            soup = BeautifulSoup(response.text, "lxml")
            data = self.extract_data(soup)

            # Mark the URL as visited under its cleaned form, so the dedup
            # check above and this bookkeeping use the same key
            self.visited.append(cleaned_url)
            self.data.update({cleaned_url: data})
            self.req_limit -= 1

            # Recursively scrape all links found on the page
            for link in data["links"]:
                self.scrape_website(link)

            return self.data

        except Exception as e:
            print(e)
            return {}
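
    # The dict returned by scrape_website maps each visited URL to the fields
    # collected by extract_data; roughly (shape only, values illustrative):
    #   {"https://docs.openwebui.com": {"title": "...", "headings": [...],
    #    "paragraphs": [...], "links": [...], "images": [...]}, ...}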

    def clean_link(self, link) -> str | None:
        # urlparse returns "" (never None) for components that are absent,
        # so empty-string checks are enough here.
        parsed_link = parse_url(link)
        netloc = parsed_link.netloc
        path = parsed_link.path

        # Skip links that point at a different host than the one we started on
        if netloc != "" and netloc != self.netloc:
            return None

        # Normalize the path: drop any trailing slash, ensure a leading one
        if path != "":
            if path.endswith("/"):
                path = path[:-1]
            if not path.startswith("/"):
                path = "/" + path

        if self.netloc is None or self.scheme is None:
            return None

        # Drop the fragment: it only points inside a page this URL already
        # covers, so keeping it would create duplicate entries.
        return self.scheme + "://" + self.netloc + path
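
    # Illustrative results, assuming the tool was seeded with netloc
    # "docs.openwebui.com" and scheme "https":
    #   clean_link("/getting-started/")     -> "https://docs.openwebui.com/getting-started"
    #   clean_link("getting-started")       -> "https://docs.openwebui.com/getting-started"
    #   clean_link("https://example.com/x") -> None (external host, skipped)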

    def extract_data(self, soup: BeautifulSoup) -> dict:
        # Collect everything into one dict with a stable set of keys, so
        # callers (and the recursion over data["links"]) can rely on them
        data = {
            "title": soup.title.string if soup.title else None,
            "headings": [],
            "paragraphs": [],
            "links": [],
            "images": [],
            "divs": [],
            "lis": [],
        }

        # Find all headings (h1 to h6)
        heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
        data["headings"] = [tag.get_text() for tag in soup.find_all(heading_tags)]

        # Find all paragraphs, divs, and list items
        data["paragraphs"] = [p.get_text() for p in soup.find_all("p")]
        data["divs"] = [div.get_text() for div in soup.find_all("div")]
        data["lis"] = [li.get_text() for li in soup.find_all("li")]

        # Extract all links
        links = soup.find_all("a", href=True)
        data["links"] = [link["href"] for link in links if link["href"]]

        # Extract image sources
        images = soup.find_all("img", src=True)
        data["images"] = [img["src"] for img in images if img["src"]]

        return data
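
    # Quick sanity check (illustrative; exact text depends on the parser):
    #   soup = BeautifulSoup("<h1>Hi</h1><p>Text</p><a href='/x'>x</a>", "lxml")
    #   Tools().extract_data(soup)
    #   # -> {"title": None, "headings": ["Hi"], "paragraphs": ["Text"],
    #   #     "links": ["/x"], "images": [], "divs": [], "lis": []}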


if __name__ == "__main__":
    url = "https://docs.openwebui.com"
    data = Tools(req_limit=10).scrape_website(url)
    print(data)
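
    # A minimal sketch of persisting the results, assuming one JSON file per
    # run is wanted; the filename "scrape_results.json" is arbitrary
    import json

    with open("scrape_results.json", "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)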