feat: initial commit

2025-01-25 17:11:13 -05:00 · 2025-01-25 17:11:13 -05:00 · 251adec7bc
commit 251adec7bc
1 changed files with 136 additions and 0 deletions
--- a/scrape.py
+++ b/scrape.py
@ -0,0 +1,136 @@
+from urllib.parse import urljoin
+from urllib.parse import urlparse as parse_url
+
+import requests
+from bs4 import BeautifulSoup
+
+
+class Tools:
+    """Small same-site crawler built on requests and BeautifulSoup.
+
+    All crawl state lives on the instance and persists across calls: the
+    first valid URL passed to ``scrape_website`` fixes the scheme and host
+    the crawl is restricted to, ``req_limit`` is the remaining request
+    budget, and ``data`` accumulates the content of every fetched page.
+    """
+
+    # Seconds to wait for a response before abandoning a request; without a
+    # timeout a single unresponsive server would hang the crawl forever.
+    request_timeout = 10
+
+    def __init__(self):
+        """Initialize the Tool."""
+        # Not read anywhere in this class; presumably consumed by the
+        # framework hosting the tool -- TODO confirm.
+        self.citation = True
+        # Remaining number of HTTP requests this instance may issue.
+        self.req_limit = 5
+        # Host and scheme the crawl is pinned to (set by the first call).
+        self.netloc = None
+        self.scheme = None
+        # Cleaned URLs that have already been requested.
+        self.visited = []
+        # Page URL -> data extracted from that page.
+        self.data = {}
+
+    def scrape_website(
+        self,
+        url: str,
+    ) -> dict:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+        :params url: The URL of the web page to be scraped. Required.
+        :return: Mapping of page URL to the data extracted from it, covering
+            ``url`` and the same-site pages reached from it.
+        :raises Exception: If ``url`` is not a fetchable URL of the crawled
+            site, or the server answers it with an error status. Network
+            errors raised by ``requests`` for ``url`` itself propagate.
+        """
+        # Pin the crawl to the site of the first usable URL. Re-latching while
+        # either part is still empty lets a later valid call recover from an
+        # earlier call that was given a URL without scheme or host.
+        if not self.netloc or not self.scheme:
+            root = parse_url(url)
+            self.netloc = root.netloc
+            self.scheme = root.scheme
+
+        if self.req_limit <= 0:
+            return self.data
+
+        cleaned_url = self.clean_link(url)
+        if cleaned_url is None:
+            raise Exception("Invalid URL")
+
+        # Only a failure of the requested page itself is an error; pages
+        # discovered through links are handled best-effort inside _crawl.
+        if not self._crawl(url, cleaned_url):
+            raise Exception("Failed to fetch URL")
+
+        return self.data
+
+    def _crawl(self, key: str, page_url: str) -> bool:
+        """Fetch ``page_url``, record its data under ``key``, follow its links.
+
+        Links are followed depth-first in document order while request
+        budget remains. Returns False when the server answers ``page_url``
+        with an error status, True otherwise.
+        """
+        # Every attempted request, successful or not, consumes budget and is
+        # remembered, so a page full of dead links cannot cause unbounded or
+        # repeated network traffic.
+        self.req_limit -= 1
+        self.visited.append(page_url)
+
+        response = requests.get(page_url, timeout=self.request_timeout)
+        if not response.ok:
+            return False
+
+        # Parse HTML content using BeautifulSoup and lxml parser
+        data = self.extract_data(BeautifulSoup(response.text, "lxml"))
+        self.data[key] = data
+
+        for href in data["links"]:
+            if self.req_limit <= 0:
+                break
+
+            # Resolve against the current page so relative links point at the
+            # right resource; off-site and non-HTTP links come back as None.
+            target = self.clean_link(href, base=page_url)
+            if target is None or target in self.visited:
+                continue
+
+            try:
+                self._crawl(target, target)
+            except requests.RequestException:
+                # One unreachable linked page must not abort the whole crawl.
+                continue
+
+        return True
+
+    def clean_link(self, link, base=None) -> str | None:
+        """Turn ``link`` into an absolute, fetchable URL on the crawled site.
+
+        :param link: Absolute or relative URL, e.g. an ``href`` value.
+        :param base: URL of the page ``link`` was found on; relative links
+            are resolved against it. Defaults to the root of the site.
+        :return: The absolute URL with its query string kept and its fragment
+            removed (fragments are never sent to the server), or None when
+            no site has been pinned yet, the link is not HTTP(S), or it
+            points to a different host.
+        """
+        if not self.netloc or self.scheme not in ("http", "https"):
+            return None
+
+        site_root = f"{self.scheme}://{self.netloc}/"
+        resolved = parse_url(urljoin(base or site_root, link))
+
+        # Rejects mailto:, javascript:, tel: and similar pseudo-links.
+        if resolved.scheme not in ("http", "https"):
+            return None
+
+        # Host names are case-insensitive.
+        if resolved.netloc.lower() != self.netloc.lower():
+            return None
+
+        # Use the pinned scheme/host spelling and a non-empty path so that
+        # equivalent spellings of one page compare equal in ``visited``.
+        return resolved._replace(
+            scheme=self.scheme,
+            netloc=self.netloc,
+            path=resolved.path or "/",
+            fragment="",
+        ).geturl()
+
+    def extract_data(self, soup: BeautifulSoup) -> dict:
+        """Collect the text and link content of a parsed page.
+
+        :param soup: Parsed document.
+        :return: Dict with ``title`` (None when the page has no title tag),
+            ``headings``, ``paragraphs``, ``links`` and ``images``; ``divs``
+            and ``lis`` are present only when the page contains such tags.
+        """
+        data = {
+            "title": soup.title.string if soup.title else None,
+            # Headings h1 to h6, in document order.
+            "headings": [
+                tag.get_text()
+                for tag in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
+            ],
+            "paragraphs": [p.get_text() for p in soup.find_all("p")],
+            # Raw attribute values; empty href/src attributes are dropped.
+            "links": [
+                a["href"] for a in soup.find_all("a", href=True) if a["href"]
+            ],
+            "images": [
+                img["src"]
+                for img in soup.find_all("img", src=True)
+                if img["src"]
+            ],
+        }
+
+        # NOTE: get_text() on a div includes the text of everything nested in
+        # it, so the same text can appear under several keys.
+        divs = soup.find_all("div")
+        if divs:
+            data["divs"] = [div.get_text() for div in divs]
+
+        lis = soup.find_all("li")
+        if lis:
+            data["lis"] = [li.get_text() for li in lis]
+
+        return data
+
+
+if __name__ == "__main__":
+    # Manual smoke run: crawl starting from the Open WebUI plugin docs page
+    # and print everything that was collected.
+    url = "https://docs.openwebui.com/features/plugin/"
+    scraper = Tools()
+    print(scraper.scrape_website(url))