diff --git a/scrape.py b/scrape.py
index dde01ab..c9fad66 100644
--- a/scrape.py
+++ b/scrape.py
@@ -3,7 +3,7 @@
 import websocket
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
-from pydantic import BaseModel, ConfigDict
+from pydantic import BaseModel, ConfigDict, Field
 
 DEFAULT_REQUEST_LIMIT = 1
@@ -143,15 +143,18 @@ class Tools:
 
             return data
 
-        def reset(self):
-            self.visited = []
-            self.data = {}
-            self.netloc = None
-            self.scheme = None
-
         def __init__(self, req_limit: int):
             super().__init__(req_limit=req_limit)
-            self.reset()
+
+    class Valves(BaseModel):
+        request_limit: int = Field(
+            default=5, description="Request Limit"
+        )
+        pass
+
+    class UserValves(BaseModel):
+        single_request: bool = Field(default=False, description="Single Request")
+        pass
 
     def __init__(self):
         """
@@ -160,23 +163,23 @@ class Tools:
         """
 
         self.citation: bool = True
+        self.valves = self.Valves()
+        self.user_valves = self.UserValves()
 
-    def scrape_recursively(
-        self, url: str, request_limit: int = DEFAULT_REQUEST_LIMIT
-    ) -> str:
+    def scrape_recursively(self, url: str) -> str:
         """
         Scrapes data from a web page using requests and BeautifulSoup.
 
         :params url: The URL of the web page to be scraped.
-        :params request_limit: The number of requests to be made to scrape the website. (Optional, Default = 1)
         """
+        if self.user_valves.single_request:
+            request_limit = 1
+        else:
+            request_limit = self.valves.request_limit
+
         scraper = self.RecursiveScraper(request_limit)
         data = scraper.scrape_website(url)
         json_s = json.dumps(data)
-
-        # if scraper.visited != [] or scraper.data != {} or scraper.netloc is not None or scraper.scheme is not None:
-        #     scraper.reset()
-        scraper = None
 
         return json_s
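
A minimal usage sketch of the tool after this change, assuming the module is importable as scrape and the rest of the Tools class (including the nested RecursiveScraper) is unchanged; the URL and valve values below are illustrative only:

from scrape import Tools

tools = Tools()

# Admin-level valve: cap the recursive crawl at 3 requests (the diff's default is 5).
tools.valves.request_limit = 3

# Per-user valve: when single_request is True, scrape_recursively ignores the
# request_limit valve and issues exactly one request.
tools.user_valves.single_request = False

# scrape_recursively now reads both valves itself and returns the scraped data
# serialized as a JSON string.
print(tools.scrape_recursively("https://example.com"))

With the limit supplied through valves rather than a function parameter, the number of requests is configured in the tool settings instead of being passed in on each call.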