diff --git a/scrape.py b/scrape.py index 26ab657..89d56cf 100644 --- a/scrape.py +++ b/scrape.py @@ -6,22 +6,19 @@ from bs4 import BeautifulSoup from pydantic import BaseModel, ConfigDict -DEFAULT_REQUEST_LIMIT = 50 -class Tools: - req_limit: int = DEFAULT_REQUEST_LIMIT +DEFAULT_REQUEST_LIMIT = 1 + +class Tools: class RecursiveScraper(BaseModel): - citation: bool = True + req_limit: int visited: list = [] netloc: str | None = None scheme: str | None = None data: dict = {} - req_limit: int = DEFAULT_REQUEST_LIMIT - model_config = ConfigDict(arbitrary_types_allowed=True) - def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT): - super().__init__() + def set_req_limit(self, req_limit: int): self.req_limit = req_limit def scrape_website( @@ -146,31 +143,46 @@ class Tools: return data - def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT): + def reset(self): + self.visited = [] + self.data = {} + self.netloc = None + self.scheme = None + + def __init__(self, req_limit: int): + super().__init__(req_limit=req_limit) + self.reset() + + def __init__(self): """ Initializes the Tools class. :params req_limit: The number of requests to be made to scrape the website. """ self.citation: bool = True - self.req_limit = req_limit - self.tool = self.RecursiveScraper(req_limit=req_limit) - def scrape_recursively(self, url: str) -> str: + def scrape_recursively( + self, url: str, request_limit: int = DEFAULT_REQUEST_LIMIT + ) -> str: """ Scrapes data from a web page using requests and BeautifulSoup. :params url: The URL of the web page to be scraped. + :params request_limit: The number of requests to be made to scrape the website. (Optional, Default = 1) """ - data = self.tool.scrape_website(url) + scraper = self.RecursiveScraper(request_limit) + data = scraper.scrape_website(url) + json_s = json.dumps(data) - self.tool.visited = [] - self.tool.data = {} - self.tool.netloc = None - self.tool.scheme = None + scraper.reset() + + return json_s - return json.dumps(data) if __name__ == "__main__": tools = Tools() - print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5")) + print( + tools.scrape_recursively( + "https://www.allrecipes.com/recipe/16700/salsa-chicken/" + ) + )