From 8c03c0d4bcc1f887ef1ab31b31dff2cbec23f95b Mon Sep 17 00:00:00 2001
From: Mark Bailey
Date: Sun, 26 Jan 2025 10:54:02 -0500
Subject: [PATCH] WIP:

* error about __signature__ for Tool being class only
---
 scrape.py | 60 +++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 21 deletions(-)

diff --git a/scrape.py b/scrape.py
index 5ba4671..30be55b 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,18 +1,32 @@
+"""
+title: Scrape Recursively
+author: Mark Bailey
+author_url:
+git_url: https://git.markbailey.dev/cerbervs/scrape.git
+description: Scrapes a website recursively using requests and BeautifulSoup.
+version: 0.1.0
+licence: MIT
+"""
+
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
+from pydantic import BaseModel, ConfigDict
 
 
-class Tools:
-    def __init__(self):
-        """Initialize the Tool."""
-        self.citation = True
-        self.visited = []
-        self.req_limit = 5
-        self.netloc = None
-        self.scheme = None
-        self.visited = []
-        self.data = {}
+class Tools(BaseModel):
+    citation: bool = True
+    visited: list = []
+    netloc: str | None = None
+    scheme: str | None = None
+    data: dict = {}
+    req_limit: int = 5
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, req_limit: int = 5):
+        super().__init__()
+        self.req_limit = req_limit
 
     def scrape_website(
         self,
@@ -31,34 +45,38 @@ class Tools:
             return self.data
 
         try:
-            # Send GET request
+            # Clean the URL
             cleaned_url = self.clean_link(url)
             if cleaned_url is None:
-                raise Exception("Invalid URL")
+                raise Exception("Invalid URL: " + url)
 
+            if cleaned_url in self.visited:
+                return {}
+
+            # Send GET request
             response = requests.get(cleaned_url)
             if not response.ok:
-                raise Exception("Failed to fetch URL")
+                self.visited.append(cleaned_url)
+                raise Exception("Failed to fetch URL: " + cleaned_url)
 
             # Parse HTML content using BeautifulSoup and lxml parser
             soup = BeautifulSoup(response.text, "lxml")
             data = self.extract_data(soup)
 
 
-            self.req_limit -= 1
+            # Mark URL as visited
             self.visited.append(url)
-
             self.data.update({url: data})
+            self.req_limit -= 1
 
+            # Scrape all links in the page
             for link in data["links"]:
-                if link in self.visited:
-                    continue
-
                 self.scrape_website(link)
 
             return self.data
         except Exception as e:
-            raise e
+            print(e)
+            return {}
 
     def clean_link(self, link) -> str | None:
         parsed_link = parse_url(link)
@@ -132,5 +150,5 @@ class Tools:
 
 
 if __name__ == "__main__":
-    url = "https://docs.openwebui.com/features/plugin/"
-    print(Tools().scrape_website(url))
+    url = "https://docs.openwebui.com"
+    print(Tools(req_limit=10).scrape_website(url))
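
Note, not part of the patch: the subject mentions an error about __signature__ being class-only
for the Tool. If that error is triggered by overriding __init__ on the pydantic BaseModel
subclass (pydantic builds a class-level __signature__ from the declared fields), one direction
to try is dropping the custom __init__ and letting the generated one accept req_limit. A minimal
sketch, assuming pydantic v2; untested against the Open WebUI loader:

# Hypothetical sketch, not part of this commit: keep req_limit as a plain
# pydantic field and rely on the generated __init__, so the class-level
# __signature__ stays consistent with the declared fields.
from pydantic import BaseModel, ConfigDict


class Tools(BaseModel):
    citation: bool = True
    visited: list = []
    netloc: str | None = None
    scheme: str | None = None
    data: dict = {}
    req_limit: int = 5

    model_config = ConfigDict(arbitrary_types_allowed=True)


if __name__ == "__main__":
    # The generated __init__ accepts the fields as keyword arguments,
    # so the existing __main__ call style still works.
    print(Tools(req_limit=10).req_limit)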