WIP

2025-01-26 13:25:58 -05:00 · 2025-01-26 13:25:58 -05:00 · 248340badb
commit 248340badb
parent 8c03c0d4bc
1 changed files with 131 additions and 108 deletions
--- a/scrape.py
+++ b/scrape.py
@ -1,20 +1,24 @@
 """
 title: Scrape Recursively
 author: Mark Bailey
-author_url: <url>
+author_url: https://git.markbailey.dev
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes a website recursively using requests and BeautifulSoup.
-version: 0.1.0
+version: 1.0.0
 licence: MIT
 """

+import json
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict


-class Tools(BaseModel):
+class Tools:
+    req_limit: int = 100
+
+    class RecursiveScraper(BaseModel):
        citation: bool = True
        visited: list = []
        netloc: str | None = None
@ -32,11 +36,6 @@ class Tools(BaseModel):
            self,
            url: str,
        ) -> dict:
-        """
-        Scrapes data from a web page using requests and BeautifulSoup.
-        :params url: The URL of the web page to be scraped. Required.
-        """
-
            if self.netloc is None:
                self.netloc = parse_url(url).netloc
                self.scheme = parse_url(url).scheme
@ -74,8 +73,7 @@ class Tools(BaseModel):

                return self.data

-        except Exception as e:
-            print(e)
+            except Exception:
                return {}

        def clean_link(self, link) -> str | None:
@ -148,7 +146,32 @@ class Tools(BaseModel):

            return data

+    def __init__(self, req_limit: int = 10):
+        """
+        Builds the tool wrapper and the RecursiveScraper instance it delegates to.
+        :param req_limit: The number of requests to be made to scrape the website.
+        """
+
+        self.citation: bool = True
+        self.req_limit = req_limit  # NOTE(review): shadows the class-level default of 100, so the effective default is 10
+        self.tool = self.RecursiveScraper(req_limit=req_limit)  # single scraper instance reused by every scrape_recursively call
+
+    def scrape_recursively(self, url: str) -> str:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+        :param url: The URL of the web page to be scraped.
+        """
+
+        data = self.tool.scrape_website(url)
+
+        self.tool.visited = []  # reset the shared scraper's per-crawl state so the next call starts clean
+        self.tool.data = {}  # rebinds instead of clearing in place, so `data` above keeps this call's result
+        self.tool.netloc = None  # scrape_website only derives netloc/scheme from the URL while netloc is None
+        self.tool.scheme = None
+
+        return json.dumps(data)  # handed back as a JSON string rather than a dict
+

 if __name__ == "__main__":
-    url = "https://docs.openwebui.com"
-    print(Tools(req_limit=10).scrape_website(url))
+    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
+    print(Tools(req_limit=10).scrape_recursively(url))  # prints the JSON string returned by scrape_recursively