WIP:

* Error about `__signature__` for `Tools` being class-only
2025-01-26 10:54:02 -05:00 · 2025-01-26 10:54:02 -05:00 · 8c03c0d4bc
commit 8c03c0d4bc
parent 251adec7bc
1 changed file with 39 additions and 21 deletions
--- a/scrape.py
+++ b/scrape.py
@@ -1,18 +1,32 @@
+"""
+title: Scrape Recursively
+author: Mark Bailey
+author_url: <url>
+git_url: https://git.markbailey.dev/cerbervs/scrape.git
+description: Scrapes a website recursively using requests and BeautifulSoup.
+version: 0.1.0
+licence: MIT
+"""
+
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
+from pydantic import BaseModel, ConfigDict


-class Tools:
-    def __init__(self):
-        """Initialize the Tool."""
-        self.citation = True
-        self.visited = []
-        self.req_limit = 5
-        self.netloc = None
-        self.scheme = None
-        self.visited = []
-        self.data = {}
+class Tools(BaseModel):
+    citation: bool = True
+    visited: list = []
+    netloc: str | None = None
+    scheme: str | None = None
+    data: dict = {}
+    req_limit: int = 5
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    def __init__(self, req_limit: int = 5):
+        super().__init__()
+        self.req_limit = req_limit

    def scrape_website(
        self,
@@ -31,34 +45,38 @@ class Tools:
            return self.data

        try:
-            # Send GET request
+            # Clean the URL
            cleaned_url = self.clean_link(url)
            if cleaned_url is None:
-                raise Exception("Invalid URL")
+                raise Exception("Invalid URL: " + url)

+            if cleaned_url in self.visited:
+                return {}
+
+            # Send GET request
            response = requests.get(cleaned_url)
            if not response.ok:
-                raise Exception("Failed to fetch URL")
+                self.visited.append(cleaned_url)
+                raise Exception("Failed to fetch URL: " + cleaned_url)

            # Parse HTML content using BeautifulSoup and lxml parser
            soup = BeautifulSoup(response.text, "lxml")
            data = self.extract_data(soup)

-            self.req_limit -= 1
+            # Mark URL as visited
            self.visited.append(url)
-
            self.data.update({url: data})
+            self.req_limit -= 1

+            # Scrape all links in the page
            for link in data["links"]:
-                if link in self.visited:
-                    continue
-
                self.scrape_website(link)

            return self.data

        except Exception as e:
-            raise e
+            print(e)
+            return {}

    def clean_link(self, link) -> str | None:
        parsed_link = parse_url(link)
@@ -132,5 +150,5 @@ class Tools:


 if __name__ == "__main__":
-    url = "https://docs.openwebui.com/features/plugin/"
-    print(Tools().scrape_website(url))
+    url = "https://docs.openwebui.com"
+    print(Tools(req_limit=10).scrape_website(url))