WIP
parent 8c03c0d4bc
commit 248340badb
scrape.py (239 changes)
@@ -1,154 +1,177 @@
 """
 title: Scrape Recursively
 author: Mark Bailey
-author_url: <url>
+author_url: https://git.markbailey.dev
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes a website recursively using requests and BeautifulSoup.
-version: 0.1.0
+version: 1.0.0
 licence: MIT
 """

+import json
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict


-class Tools(BaseModel):
-    citation: bool = True
-    visited: list = []
-    netloc: str | None = None
-    scheme: str | None = None
-    data: dict = {}
-    req_limit: int = 5
-
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    def __init__(self, req_limit: int = 5):
-        super().__init__()
-        self.req_limit = req_limit
-
-    def scrape_website(
-        self,
-        url: str,
-    ) -> dict:
-        """
-        Scrapes data from a web page using requests and BeautifulSoup.
-        :params url: The URL of the web page to be scraped. Required.
-        """
-
-        if self.netloc is None:
-            self.netloc = parse_url(url).netloc
-            self.scheme = parse_url(url).scheme
-
-        if self.req_limit == 0:
-            return self.data
-
-        try:
-            # Clean the URL
-            cleaned_url = self.clean_link(url)
-            if cleaned_url is None:
-                raise Exception("Invalid URL: " + url)
-
-            if cleaned_url in self.visited:
-                return {}
-
-            # Send GET request
-            response = requests.get(cleaned_url)
-            if not response.ok:
-                self.visited.append(cleaned_url)
-                raise Exception("Failed to fetch URL: " + cleaned_url)
-
-            # Parse HTML content using BeautifulSoup and lxml parser
-            soup = BeautifulSoup(response.text, "lxml")
-            data = self.extract_data(soup)
-
-            # Mark URL as visited
-            self.visited.append(url)
-            self.data.update({url: data})
-            self.req_limit -= 1
-
-            # Scrape all links in the page
-            for link in data["links"]:
-                self.scrape_website(link)
-
-            return self.data
-
-        except Exception as e:
-            print(e)
-            return {}
-
-    def clean_link(self, link) -> str | None:
-        parsed_link = parse_url(link)
-        netloc = parsed_link.netloc
-        path = parsed_link.path
-        fragment = parsed_link.fragment
-
-        if netloc is not None and netloc != "" and netloc != self.netloc:
-            return None
-
-        if path is not None and path != "":
-            if path.endswith("/"):
-                path = path[:-1]
-            if not path.startswith("/"):
-                path = "/" + path
-
-        if fragment is not None or fragment != "":
-            if parsed_link.fragment.endswith("/"):
-                link = link[:-1]
-            if not fragment.startswith("/"):
-                fragment = "/" + fragment
-
-        if self.netloc is None or self.scheme is None:
-            return None
-
-        link = self.netloc + path + fragment
-        link = self.scheme + "://" + link.replace("//", "/")
-
-        return link
-
-    def extract_data(self, soup: BeautifulSoup) -> dict:
-        # Extract data
-        data = {
-            "title": soup.title.string if soup.title else None,
-            "headings": [],
-            "paragraphs": [],
-            "links": [],
-            "images": [],
-        }
-
-        # Find all headings (h1 to h6)
-        heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
-        headings = soup.find_all(heading_tag)
-        if headings:
-            data["headings"].extend([tag.get_text() for tag in headings])
-
-        # Find all paragraphs
-        paragraphs = soup.find_all("p")
-        if paragraphs:
-            data["paragraphs"] = [p.get_text() for p in paragraphs]
-
-        divs = soup.find_all("div")
-        if divs:
-            data["divs"] = [div.get_text() for div in divs]
-
-        lis = soup.find_all("li")
-        if lis:
-            data["lis"] = [li.get_text() for li in lis]
-
-        # Extract all links
-        links = soup.find_all("a", href=True)
-        if links:
-            data["links"] = [link["href"] for link in links if link["href"]]
-
-        # Extract image sources
-        images = soup.find_all("img", src=True)
-        if images:
-            data["images"] = [img["src"] for img in images if img["src"]]
-
-        return data
+class Tools:
+    req_limit: int = 100
+
+    class RecursiveScraper(BaseModel):
+        citation: bool = True
+        visited: list = []
+        netloc: str | None = None
+        scheme: str | None = None
+        data: dict = {}
+        req_limit: int = 5
+
+        model_config = ConfigDict(arbitrary_types_allowed=True)
+
+        def __init__(self, req_limit: int = 5):
+            super().__init__()
+            self.req_limit = req_limit
+
+        def scrape_website(
+            self,
+            url: str,
+        ) -> dict:
+            if self.netloc is None:
+                self.netloc = parse_url(url).netloc
+                self.scheme = parse_url(url).scheme
+
+            if self.req_limit == 0:
+                return self.data
+
+            try:
+                # Clean the URL
+                cleaned_url = self.clean_link(url)
+                if cleaned_url is None:
+                    raise Exception("Invalid URL: " + url)
+
+                if cleaned_url in self.visited:
+                    return {}
+
+                # Send GET request
+                response = requests.get(cleaned_url)
+                if not response.ok:
+                    self.visited.append(cleaned_url)
+                    raise Exception("Failed to fetch URL: " + cleaned_url)
+
+                # Parse HTML content using BeautifulSoup and lxml parser
+                soup = BeautifulSoup(response.text, "lxml")
+                data = self.extract_data(soup)
+
+                # Mark URL as visited
+                self.visited.append(url)
+                self.data.update({url: data})
+                self.req_limit -= 1
+
+                # Scrape all links in the page
+                for link in data["links"]:
+                    self.scrape_website(link)
+
+                return self.data
+
+            except Exception:
+                return {}
+
+        def clean_link(self, link) -> str | None:
+            parsed_link = parse_url(link)
+            netloc = parsed_link.netloc
+            path = parsed_link.path
+            fragment = parsed_link.fragment
+
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None
+
+            if path is not None and path != "":
+                if path.endswith("/"):
+                    path = path[:-1]
+                if not path.startswith("/"):
+                    path = "/" + path
+
+            if fragment is not None or fragment != "":
+                if parsed_link.fragment.endswith("/"):
+                    link = link[:-1]
+                if not fragment.startswith("/"):
+                    fragment = "/" + fragment
+
+            if self.netloc is None or self.scheme is None:
+                return None
+
+            link = self.netloc + path + fragment
+            link = self.scheme + "://" + link.replace("//", "/")
+
+            return link
+
+        def extract_data(self, soup: BeautifulSoup) -> dict:
+            # Extract data
+            data = {
+                "title": soup.title.string if soup.title else None,
+                "headings": [],
+                "paragraphs": [],
+                "links": [],
+                "images": [],
+            }
+
+            # Find all headings (h1 to h6)
+            heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
+            headings = soup.find_all(heading_tag)
+            if headings:
+                data["headings"].extend([tag.get_text() for tag in headings])
+
+            # Find all paragraphs
+            paragraphs = soup.find_all("p")
+            if paragraphs:
+                data["paragraphs"] = [p.get_text() for p in paragraphs]
+
+            divs = soup.find_all("div")
+            if divs:
+                data["divs"] = [div.get_text() for div in divs]
+
+            lis = soup.find_all("li")
+            if lis:
+                data["lis"] = [li.get_text() for li in lis]
+
+            # Extract all links
+            links = soup.find_all("a", href=True)
+            if links:
+                data["links"] = [link["href"] for link in links if link["href"]]
+
+            # Extract image sources
+            images = soup.find_all("img", src=True)
+            if images:
+                data["images"] = [img["src"] for img in images if img["src"]]
+
+            return data
+
+    def __init__(self, req_limit: int = 10):
+        """
+        Initializes the Tools class.
+        :params req_limit: The number of requests to be made to scrape the website.
+        """
+
+        self.citation: bool = True
+        self.req_limit = req_limit
+        self.tool = self.RecursiveScraper(req_limit=req_limit)
+
+    def scrape_recursively(self, url: str) -> str:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+        :params url: The URL of the web page to be scraped.
+        """
+
+        data = self.tool.scrape_website(url)

+        self.tool.visited = []
+        self.tool.data = {}
+        self.tool.netloc = None
+        self.tool.scheme = None
+
+        return json.dumps(data)


 if __name__ == "__main__":
-    url = "https://docs.openwebui.com"
-    print(Tools(req_limit=10).scrape_website(url))
+    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
+    print(Tools(req_limit=10).scrape_recursively(url))
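For reference, a minimal usage sketch of the reworked tool (not part of the commit): it assumes the new scrape.py shown above is importable as a module named scrape, that requests, beautifulsoup4, lxml and pydantic are installed, and that network access is available; the target URL below is illustrative.

# Usage sketch (illustrative, not part of this commit).
# Assumes the new scrape.py above is on the import path as "scrape".
import json

from scrape import Tools

# Cap the crawl at 10 requests, mirroring the __main__ block in the diff.
tools = Tools(req_limit=10)

# scrape_recursively() returns a JSON string: json.dumps({url: extracted_data, ...}).
result = json.loads(tools.scrape_recursively("https://git.markbailey.dev"))

# Each key is a visited URL; each value holds title, headings, paragraphs, links, images, etc.
for url, page in result.items():
    print(url, "->", page.get("title"))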