WIP:
* error about __signature__ for Tool being class only
This commit is contained in:
parent 251adec7bc
commit 8c03c0d4bc
60 scrape.py
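The `__signature__` note in the commit message presumably refers to pydantic exposing the generated `__signature__` on the model class rather than on instances, which matters to anything that introspects a `Tools` object after this change to `BaseModel`. A minimal sketch of that behaviour, assuming a pydantic v2 install; the two fields are a stand-in, not the full `Tools` model from this diff:

# Minimal sketch (assumption: pydantic v2; the fields below are a stand-in,
# not the full Tools model from this diff).
import inspect

from pydantic import BaseModel


class Tools(BaseModel):
    citation: bool = True
    req_limit: int = 5


# pydantic builds a signature from the declared fields and exposes it on the
# *class*, so class-level introspection works:
print(inspect.signature(Tools))   # e.g. (*, citation: bool = True, req_limit: int = 5) -> None
print(Tools.__signature__)        # same signature, read from the class

# Instance access is the fragile part: depending on the pydantic release,
# __signature__ may only be reachable through the class, which is presumably
# what the "class only" remark in the commit message is about.
tool = Tools()
print(getattr(tool, "__signature__", "not available on the instance"))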
@@ -1,18 +1,32 @@
+"""
+title: Scrape Recursively
+author: Mark Bailey
+author_url: <url>
+git_url: https://git.markbailey.dev/cerbervs/scrape.git
+description: Scrapes a website recursively using requests and BeautifulSoup.
+version: 0.1.0
+licence: MIT
+"""
+
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
+from pydantic import BaseModel, ConfigDict
 
 
-class Tools:
-    def __init__(self):
-        """Initialize the Tool."""
-        self.citation = True
-        self.visited = []
-        self.req_limit = 5
-        self.netloc = None
-        self.scheme = None
-        self.visited = []
-        self.data = {}
+class Tools(BaseModel):
+    citation: bool = True
+    visited: list = []
+    netloc: str | None = None
+    scheme: str | None = None
+    data: dict = {}
+    req_limit: int = 5
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, req_limit: int = 5):
+        super().__init__()
+        self.req_limit = req_limit
 
     def scrape_website(
         self,
@@ -31,34 +45,38 @@ class Tools:
             return self.data
 
         try:
-            # Send GET request
+            # Clean the URL
             cleaned_url = self.clean_link(url)
             if cleaned_url is None:
-                raise Exception("Invalid URL")
+                raise Exception("Invalid URL: " + url)
 
+            if cleaned_url in self.visited:
+                return {}
+
+            # Send GET request
             response = requests.get(cleaned_url)
             if not response.ok:
-                raise Exception("Failed to fetch URL")
+                self.visited.append(cleaned_url)
+                raise Exception("Failed to fetch URL: " + cleaned_url)
 
             # Parse HTML content using BeautifulSoup and lxml parser
             soup = BeautifulSoup(response.text, "lxml")
             data = self.extract_data(soup)
 
-            self.req_limit -= 1
+            # Mark URL as visited
             self.visited.append(url)
-
             self.data.update({url: data})
+            self.req_limit -= 1
 
+            # Scrape all links in the page
             for link in data["links"]:
-                if link in self.visited:
-                    continue
-
                 self.scrape_website(link)
 
             return self.data
 
         except Exception as e:
-            raise e
+            print(e)
+            return {}
 
     def clean_link(self, link) -> str | None:
         parsed_link = parse_url(link)
@@ -132,5 +150,5 @@ class Tools:
 
 
 if __name__ == "__main__":
-    url = "https://docs.openwebui.com/features/plugin/"
-    print(Tools().scrape_website(url))
+    url = "https://docs.openwebui.com"
+    print(Tools(req_limit=10).scrape_website(url))