WIP
This commit is contained in:
parent 8c03c0d4bc
commit 248340badb
239 scrape.py
@@ -1,154 +1,177 @@
"""
title: Scrape Recursively
author: Mark Bailey
author_url: https://git.markbailey.dev
git_url: https://git.markbailey.dev/cerbervs/scrape.git
description: Scrapes a website recursively using requests and BeautifulSoup.
version: 1.0.0
licence: MIT
"""

import json

import requests
from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup
from pydantic import BaseModel, ConfigDict


class Tools:
    req_limit: int = 100

    class RecursiveScraper(BaseModel):
        citation: bool = True
        visited: list = []
        netloc: str | None = None
        scheme: str | None = None
        data: dict = {}
        req_limit: int = 5

        model_config = ConfigDict(arbitrary_types_allowed=True)

        def __init__(self, req_limit: int = 5):
            super().__init__()
            self.req_limit = req_limit

        def scrape_website(
            self,
            url: str,
        ) -> dict:
            """
            Scrapes data from a web page using requests and BeautifulSoup.
            :param url: The URL of the web page to be scraped. Required.
            """
            # Lock the scraper to the host and scheme of the first URL it sees.
            if self.netloc is None:
                self.netloc = parse_url(url).netloc
                self.scheme = parse_url(url).scheme

            # Stop recursing once the request budget is exhausted.
            if self.req_limit == 0:
                return self.data

            try:
                # Clean the URL
                cleaned_url = self.clean_link(url)
                if cleaned_url is None:
                    raise Exception("Invalid URL: " + url)

                if cleaned_url in self.visited:
                    return {}

                # Send GET request
                response = requests.get(cleaned_url)
                if not response.ok:
                    self.visited.append(cleaned_url)
                    raise Exception("Failed to fetch URL: " + cleaned_url)

                # Parse HTML content using BeautifulSoup and lxml parser
                soup = BeautifulSoup(response.text, "lxml")
                data = self.extract_data(soup)

                # Mark the cleaned URL as visited so the check above catches repeats
                self.visited.append(cleaned_url)
                self.data.update({cleaned_url: data})
                self.req_limit -= 1

                # Scrape all links in the page
                for link in data["links"]:
                    self.scrape_website(link)

                return self.data

            except Exception as e:
                print(e)
                return {}

        def clean_link(self, link) -> str | None:
            parsed_link = parse_url(link)
            netloc = parsed_link.netloc
            path = parsed_link.path
            fragment = parsed_link.fragment

            # Skip links that point to a different host.
            if netloc is not None and netloc != "" and netloc != self.netloc:
                return None

            if path is not None and path != "":
                if path.endswith("/"):
                    path = path[:-1]
                if not path.startswith("/"):
                    path = "/" + path

            if fragment is not None and fragment != "":
                if fragment.endswith("/"):
                    fragment = fragment[:-1]
                if not fragment.startswith("/"):
                    fragment = "/" + fragment

            if self.netloc is None or self.scheme is None:
                return None

            # Rebuild an absolute URL on the original scheme and host.
            link = self.netloc + path + fragment
            link = self.scheme + "://" + link.replace("//", "/")

            return link

        def extract_data(self, soup: BeautifulSoup) -> dict:
            # Extract data
            data = {
                "title": soup.title.string if soup.title else None,
                "headings": [],
                "paragraphs": [],
                "links": [],
                "images": [],
            }

            # Find all headings (h1 to h6)
            heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
            headings = soup.find_all(heading_tag)
            if headings:
                data["headings"].extend([tag.get_text() for tag in headings])

            # Find all paragraphs
            paragraphs = soup.find_all("p")
            if paragraphs:
                data["paragraphs"] = [p.get_text() for p in paragraphs]

            divs = soup.find_all("div")
            if divs:
                data["divs"] = [div.get_text() for div in divs]

            lis = soup.find_all("li")
            if lis:
                data["lis"] = [li.get_text() for li in lis]

            # Extract all links
            links = soup.find_all("a", href=True)
            if links:
                data["links"] = [link["href"] for link in links if link["href"]]

            # Extract image sources
            images = soup.find_all("img", src=True)
            if images:
                data["images"] = [img["src"] for img in images if img["src"]]

            return data

    def __init__(self, req_limit: int = 10):
        """
        Initializes the Tools class.
        :param req_limit: The number of requests to be made to scrape the website.
        """
        self.citation: bool = True
        self.req_limit = req_limit
        self.tool = self.RecursiveScraper(req_limit=req_limit)

    def scrape_recursively(self, url: str) -> str:
        """
        Scrapes data from a web page recursively using requests and BeautifulSoup.
        :param url: The URL of the web page to be scraped.
        """
        data = self.tool.scrape_website(url)

        # Reset the scraper (including its request budget) so the tool can be reused.
        self.tool.visited = []
        self.tool.data = {}
        self.tool.netloc = None
        self.tool.scheme = None
        self.tool.req_limit = self.req_limit

        return json.dumps(data)


if __name__ == "__main__":
    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
    print(Tools(req_limit=10).scrape_recursively(url))
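
A minimal sketch, not part of this commit, of how the JSON string returned by scrape_recursively could be consumed, assuming the file above is saved as scrape.py somewhere on the import path and the starting URL (a placeholder here) is reachable; the per-page keys mirror the dictionary built in extract_data:

import json

from scrape import Tools  # assumption: the file above is importable as "scrape"

# Parse the JSON string back into a dict of {url: extracted page data}.
result = json.loads(Tools(req_limit=5).scrape_recursively("https://example.com"))

for page_url, page in result.items():
    print(page_url, "-", page["title"], "-", len(page["links"]), "links")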