feat: v1.1.0

This commit is contained in:
Mark Bailey 2025-01-26 14:37:52 -05:00
parent ff1401761d
commit 38eebe6ec7

View File

@ -6,22 +6,19 @@ from bs4 import BeautifulSoup
from pydantic import BaseModel, ConfigDict
DEFAULT_REQUEST_LIMIT = 50
class Tools:
req_limit: int = DEFAULT_REQUEST_LIMIT
DEFAULT_REQUEST_LIMIT = 1
class Tools:
class RecursiveScraper(BaseModel):
citation: bool = True
req_limit: int
visited: list = []
netloc: str | None = None
scheme: str | None = None
data: dict = {}
req_limit: int = DEFAULT_REQUEST_LIMIT
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
super().__init__()
def set_req_limit(self, req_limit: int):
self.req_limit = req_limit
def scrape_website(
@ -146,31 +143,46 @@ class Tools:
return data
def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
def reset(self):
self.visited = []
self.data = {}
self.netloc = None
self.scheme = None
def __init__(self, req_limit: int):
super().__init__(req_limit=req_limit)
self.reset()
def __init__(self):
"""
Initializes the Tools class.
:params req_limit: The number of requests to be made to scrape the website.
"""
self.citation: bool = True
self.req_limit = req_limit
self.tool = self.RecursiveScraper(req_limit=req_limit)
def scrape_recursively(self, url: str) -> str:
def scrape_recursively(
self, url: str, request_limit: int = DEFAULT_REQUEST_LIMIT
) -> str:
"""
Scrapes data from a web page using requests and BeautifulSoup.
:params url: The URL of the web page to be scraped.
:params request_limit: The number of requests to be made to scrape the website. (Optional, Default = 1)
"""
data = self.tool.scrape_website(url)
scraper = self.RecursiveScraper(request_limit)
data = scraper.scrape_website(url)
json_s = json.dumps(data)
self.tool.visited = []
self.tool.data = {}
self.tool.netloc = None
self.tool.scheme = None
scraper.reset()
return json_s
return json.dumps(data)
if __name__ == "__main__":
tools = Tools()
print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))
print(
tools.scrape_recursively(
"https://www.allrecipes.com/recipe/16700/salsa-chicken/"
)
)