feat: v1.1.0

2025-01-26 14:37:52 -05:00 · 2025-01-26 14:37:52 -05:00 · 38eebe6ec7
commit 38eebe6ec7
parent ff1401761d
1 changed file with 31 additions and 19 deletions
--- a/scrape.py
+++ b/scrape.py
@ -6,22 +6,19 @@ from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict
-DEFAULT_REQUEST_LIMIT = 50
+DEFAULT_REQUEST_LIMIT = 1
 class Tools:
    req_limit: int = DEFAULT_REQUEST_LIMIT
 class Tools:
    class RecursiveScraper(BaseModel):
-        citation: bool = True
+        req_limit: int
        visited: list = []
        netloc: str | None = None
        scheme: str | None = None
        data: dict = {}
        req_limit: int = DEFAULT_REQUEST_LIMIT
        model_config = ConfigDict(arbitrary_types_allowed=True)
-        def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
+        def set_req_limit(self, req_limit: int):
            super().__init__()
            self.req_limit = req_limit
        def scrape_website(
@ -146,31 +143,46 @@ class Tools:
            return data
-    def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
+        def reset(self):
            self.visited = []
            self.data = {}
            self.netloc = None
            self.scheme = None
        def __init__(self, req_limit: int):
            super().__init__(req_limit=req_limit)
            self.reset()
    def __init__(self):
        """
        Initializes the Tools class.
        :params req_limit: The number of requests to be made to scrape the website.
        """
        self.citation: bool = True
        self.req_limit = req_limit
        self.tool = self.RecursiveScraper(req_limit=req_limit)
-    def scrape_recursively(self, url: str) -> str:
+    def scrape_recursively(
        self, url: str, request_limit: int = DEFAULT_REQUEST_LIMIT
    ) -> str:
        """
        Scrapes data from a web page using requests and BeautifulSoup.
        :params url: The URL of the web page to be scraped.
        :params request_limit: The number of requests to be made to scrape the website. (Optional, Default = 1)
        """
-        data = self.tool.scrape_website(url)
+        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
        json_s = json.dumps(data)
-        self.tool.visited = []
+        scraper.reset()
-        self.tool.data = {}
+
-        self.tool.netloc = None
+        return json_s
        self.tool.scheme = None
        return json.dumps(data)
 if __name__ == "__main__":
    tools = Tools()
-    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))
+    print(
        tools.scrape_recursively(
            "https://www.allrecipes.com/recipe/16700/salsa-chicken/"
        )
    )