""" title: Scrape Recursively author: Mark Bailey author_url: git_url: https://git.markbailey.dev/cerbervs/scrape.git description: Scrapes a website recursively using requests and BeautifulSoup. version: 0.1.0 licence: MIT """ import requests from urllib.parse import urlparse as parse_url from bs4 import BeautifulSoup from pydantic import BaseModel, ConfigDict class Tools(BaseModel): citation: bool = True visited: list = [] netloc: str | None = None scheme: str | None = None data: dict = {} req_limit: int = 5 model_config = ConfigDict(arbitrary_types_allowed=True) def __init__(self, req_limit: int = 5): super().__init__() self.req_limit = req_limit def scrape_website( self, url: str, ) -> dict: """ Scrapes data from a web page using requests and BeautifulSoup. :params url: The URL of the web page to be scraped. Required. """ if self.netloc is None: self.netloc = parse_url(url).netloc self.scheme = parse_url(url).scheme if self.req_limit == 0: return self.data try: # Clean the URL cleaned_url = self.clean_link(url) if cleaned_url is None: raise Exception("Invalid URL: " + url) if cleaned_url in self.visited: return {} # Send GET request response = requests.get(cleaned_url) if not response.ok: self.visited.append(cleaned_url) raise Exception("Failed to fetch URL: " + cleaned_url) # Parse HTML content using BeautifulSoup and lxml parser soup = BeautifulSoup(response.text, "lxml") data = self.extract_data(soup) # Mark URL as visited self.visited.append(url) self.data.update({url: data}) self.req_limit -= 1 # Scrape all links in the page for link in data["links"]: self.scrape_website(link) return self.data except Exception as e: print(e) return {} def clean_link(self, link) -> str | None: parsed_link = parse_url(link) netloc = parsed_link.netloc path = parsed_link.path fragment = parsed_link.fragment if netloc is not None and netloc != "" and netloc != self.netloc: return None if path is not None and path != "": if path.endswith("/"): path = path[:-1] if not path.startswith("/"): path = "/" + path if fragment is not None or fragment != "": if parsed_link.fragment.endswith("/"): link = link[:-1] if not fragment.startswith("/"): fragment = "/" + fragment if self.netloc is None or self.scheme is None: return None link = self.netloc + path + fragment link = self.scheme + "://" + link.replace("//", "/") return link def extract_data(self, soup: BeautifulSoup) -> dict: # Extract data data = { "title": soup.title.string if soup.title else None, "headings": [], "paragraphs": [], "links": [], "images": [], } # Find all headings (h1 to h6) heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"] headings = soup.find_all(heading_tag) if headings: data["headings"].extend([tag.get_text() for tag in headings]) # Find all paragraphs paragraphs = soup.find_all("p") if paragraphs: data["paragraphs"] = [p.get_text() for p in paragraphs] divs = soup.find_all("div") if divs: data["divs"] = [div.get_text() for div in divs] lis = soup.find_all("li") if lis: data["lis"] = [li.get_text() for li in lis] # Extract all links links = soup.find_all("a", href=True) if links: data["links"] = [link["href"] for link in links if link["href"]] # Extract image sources images = soup.find_all("img", src=True) if images: data["images"] = [img["src"] for img in images if img["src"]] return data if __name__ == "__main__": url = "https://docs.openwebui.com" print(Tools(req_limit=10).scrape_website(url))