diff --git a/scrape.py b/scrape.py
index 30be55b..4431fc8 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,154 +1,177 @@
 """
 title: Scrape Recursively
 author: Mark Bailey
-author_url:
+author_url: https://git.markbailey.dev
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes a website recursively using requests and BeautifulSoup.
-version: 0.1.0
+version: 1.0.0
 licence: MIT
 """
 
+import json
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict
 
 
-class Tools(BaseModel):
-    citation: bool = True
-    visited: list = []
-    netloc: str | None = None
-    scheme: str | None = None
-    data: dict = {}
-    req_limit: int = 5
+class Tools:
+    req_limit: int = 100
 
-    model_config = ConfigDict(arbitrary_types_allowed=True)
+    class RecursiveScraper(BaseModel):
+        citation: bool = True
+        visited: list = []
+        netloc: str | None = None
+        scheme: str | None = None
+        data: dict = {}
+        req_limit: int = 5
 
-    def __init__(self, req_limit: int = 5):
-        super().__init__()
-        self.req_limit = req_limit
+        model_config = ConfigDict(arbitrary_types_allowed=True)
 
-    def scrape_website(
-        self,
-        url: str,
-    ) -> dict:
-        """
-        Scrapes data from a web page using requests and BeautifulSoup.
-        :params url: The URL of the web page to be scraped. Required.
-        """
+        def __init__(self, req_limit: int = 5):
+            super().__init__()
+            self.req_limit = req_limit
 
-        if self.netloc is None:
-            self.netloc = parse_url(url).netloc
-            self.scheme = parse_url(url).scheme
+        def scrape_website(
+            self,
+            url: str,
+        ) -> dict:
+            if self.netloc is None:
+                self.netloc = parse_url(url).netloc
+                self.scheme = parse_url(url).scheme
 
-        if self.req_limit == 0:
-            return self.data
+            if self.req_limit == 0:
+                return self.data
 
-        try:
-            # Clean the URL
-            cleaned_url = self.clean_link(url)
-            if cleaned_url is None:
-                raise Exception("Invalid URL: " + url)
+            try:
+                # Clean the URL
+                cleaned_url = self.clean_link(url)
+                if cleaned_url is None:
+                    raise Exception("Invalid URL: " + url)
 
-            if cleaned_url in self.visited:
+                if cleaned_url in self.visited:
+                    return {}
+
+                # Send GET request
+                response = requests.get(cleaned_url)
+                if not response.ok:
+                    self.visited.append(cleaned_url)
+                    raise Exception("Failed to fetch URL: " + cleaned_url)
+
+                # Parse HTML content using BeautifulSoup and lxml parser
+                soup = BeautifulSoup(response.text, "lxml")
+                data = self.extract_data(soup)
+
+                # Mark URL as visited
+                self.visited.append(url)
+                self.data.update({url: data})
+                self.req_limit -= 1
+
+                # Scrape all links in the page
+                for link in data["links"]:
+                    self.scrape_website(link)
+
+                return self.data
+
+            except Exception:
                 return {}
 
-            # Send GET request
-            response = requests.get(cleaned_url)
-            if not response.ok:
-                self.visited.append(cleaned_url)
-                raise Exception("Failed to fetch URL: " + cleaned_url)
+        def clean_link(self, link) -> str | None:
+            parsed_link = parse_url(link)
+            netloc = parsed_link.netloc
+            path = parsed_link.path
+            fragment = parsed_link.fragment
 
-            # Parse HTML content using BeautifulSoup and lxml parser
-            soup = BeautifulSoup(response.text, "lxml")
-            data = self.extract_data(soup)
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None
 
-            # Mark URL as visited
-            self.visited.append(url)
-            self.data.update({url: data})
-            self.req_limit -= 1
+            if path is not None and path != "":
+                if path.endswith("/"):
+                    path = path[:-1]
+                if not path.startswith("/"):
+                    path = "/" + path
 
-            # Scrape all links in the page
-            for link in data["links"]:
-                self.scrape_website(link)
+            if fragment is not None or fragment != "":
+                if parsed_link.fragment.endswith("/"):
+                    link = link[:-1]
+                if not fragment.startswith("/"):
+                    fragment = "/" + fragment
 
-            return self.data
+            if self.netloc is None or self.scheme is None:
+                return None
 
-        except Exception as e:
-            print(e)
-            return {}
+            link = self.netloc + path + fragment
+            link = self.scheme + "://" + link.replace("//", "/")
 
-    def clean_link(self, link) -> str | None:
-        parsed_link = parse_url(link)
-        netloc = parsed_link.netloc
-        path = parsed_link.path
-        fragment = parsed_link.fragment
+            return link
 
-        if netloc is not None and netloc != "" and netloc != self.netloc:
-            return None
+        def extract_data(self, soup: BeautifulSoup) -> dict:
+            # Extract data
+            data = {
+                "title": soup.title.string if soup.title else None,
+                "headings": [],
+                "paragraphs": [],
+                "links": [],
+                "images": [],
+            }
 
-        if path is not None and path != "":
-            if path.endswith("/"):
-                path = path[:-1]
-            if not path.startswith("/"):
-                path = "/" + path
+            # Find all headings (h1 to h6)
+            heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
+            headings = soup.find_all(heading_tag)
+            if headings:
+                data["headings"].extend([tag.get_text() for tag in headings])
 
-        if fragment is not None or fragment != "":
-            if parsed_link.fragment.endswith("/"):
-                link = link[:-1]
-            if not fragment.startswith("/"):
-                fragment = "/" + fragment
+            # Find all paragraphs
+            paragraphs = soup.find_all("p")
+            if paragraphs:
+                data["paragraphs"] = [p.get_text() for p in paragraphs]
 
-        if self.netloc is None or self.scheme is None:
-            return None
+            divs = soup.find_all("div")
+            if divs:
+                data["divs"] = [div.get_text() for div in divs]
 
-        link = self.netloc + path + fragment
-        link = self.scheme + "://" + link.replace("//", "/")
+            lis = soup.find_all("li")
+            if lis:
+                data["lis"] = [li.get_text() for li in lis]
 
-        return link
+            # Extract all links
+            links = soup.find_all("a", href=True)
+            if links:
+                data["links"] = [link["href"] for link in links if link["href"]]
 
-    def extract_data(self, soup: BeautifulSoup) -> dict:
-        # Extract data
-        data = {
-            "title": soup.title.string if soup.title else None,
-            "headings": [],
-            "paragraphs": [],
-            "links": [],
-            "images": [],
-        }
+            # Extract image sources
+            images = soup.find_all("img", src=True)
+            if images:
+                data["images"] = [img["src"] for img in images if img["src"]]
 
-        # Find all headings (h1 to h6)
-        heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
-        headings = soup.find_all(heading_tag)
-        if headings:
-            data["headings"].extend([tag.get_text() for tag in headings])
+            return data
 
-        # Find all paragraphs
-        paragraphs = soup.find_all("p")
-        if paragraphs:
-            data["paragraphs"] = [p.get_text() for p in paragraphs]
+    def __init__(self, req_limit: int = 10):
+        """
+        Initializes the Tools class.
+        :params req_limit: The number of requests to be made to scrape the website.
+        """
 
-        divs = soup.find_all("div")
-        if divs:
-            data["divs"] = [div.get_text() for div in divs]
+        self.citation: bool = True
+        self.req_limit = req_limit
+        self.tool = self.RecursiveScraper(req_limit=req_limit)
 
-        lis = soup.find_all("li")
-        if lis:
-            data["lis"] = [li.get_text() for li in lis]
+    def scrape_recursively(self, url: str) -> str:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+        :params url: The URL of the web page to be scraped.
+        """
 
-        # Extract all links
-        links = soup.find_all("a", href=True)
-        if links:
-            data["links"] = [link["href"] for link in links if link["href"]]
+        data = self.tool.scrape_website(url)
 
-        # Extract image sources
-        images = soup.find_all("img", src=True)
-        if images:
-            data["images"] = [img["src"] for img in images if img["src"]]
+        self.tool.visited = []
+        self.tool.data = {}
+        self.tool.netloc = None
+        self.tool.scheme = None
 
-        return data
+        return json.dumps(data)
 
 
 if __name__ == "__main__":
-    url = "https://docs.openwebui.com"
-    print(Tools(req_limit=10).scrape_website(url))
+    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
+    print(Tools(req_limit=10).scrape_recursively(url))