From 94166a7258231650c46defc9b6511f4c63d74324 Mon Sep 17 00:00:00 2001
From: Mark Bailey
Date: Sun, 26 Jan 2025 17:23:22 -0500
Subject: [PATCH] feat: v2.1.4

* fix: URLs being trashed
---
 scrape.py | 62 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/scrape.py b/scrape.py
index 4b54a06..afe648b 100644
--- a/scrape.py
+++ b/scrape.py
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """
 
@@ -26,9 +26,6 @@ class Tools:
         data: dict = {}
         model_config = ConfigDict(arbitrary_types_allowed=True)
 
-        def set_req_limit(self, req_limit: int):
-            self.req_limit = req_limit
-
         def scrape_website(
             self,
             url: str,
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]
 
                # Mark URL as visited
-                self.visited.append(url)
-                self.data.update({url: data})
+                self.visited.append(cleaned_url)
+                self.data.update({cleaned_url: data})
                self.req_limit -= 1
 
                # Scrape all links in the page
@@ -85,37 +82,46 @@ class Tools:
                return self.data
 
-            except Exception:
+            except Exception as e:
+                print(f"Error when trying URL: {e}")
                return {}
 
        def clean_link(self, link) -> str | None:
            parsed_link = parse_url(link)
+            # separator reference: scheme :// netloc /path ;params ?query #fragment
+
            scheme = parsed_link.scheme
            netloc = parsed_link.netloc
            path = parsed_link.path
+            params = parsed_link.params
+            query = parsed_link.query
            fragment = parsed_link.fragment
 
-            if netloc is not None and netloc != "" and netloc != self.netloc:
-                return None
-
-            if path is not None and path != "":
-                if path.endswith("/"):
-                    path = path[:-1]
-                if not path.startswith("/"):
-                    path = "/" + path
-
-            if fragment is not None or fragment != "":
-                if parsed_link.fragment.endswith("/"):
-                    link = link[:-1]
-                if not fragment.startswith("/"):
-                    fragment = "/" + fragment
-
            if self.netloc is None or self.scheme is None:
                return None
 
-            link = self.netloc + path + fragment
-            link = self.scheme + "://" + link.replace("//", "/")
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None
 
-            return link
+            # clean each part of the URL and then reconstruct it
+            if scheme is None or scheme == "":
+                scheme = self.scheme
+
+            if netloc is None or netloc == "":
+                netloc = self.netloc
+
+            if path is not None and path != "":
+                path = path if path.startswith("/") else "/" + path
+
+            if params is not None and params != "":
+                params = ";" + params
+
+            if query is not None and query != "":
+                query = "?" + query
+
+            if fragment is not None and fragment != "":
+                fragment = "#" + fragment
+
+            return f"{scheme}://{netloc}{path}{params}{query}{fragment}"
 
        def extract_data(self, soup: BeautifulSoup) -> dict:
            # Extract data
@@ -164,7 +170,7 @@ class Tools:
        single_request: bool = Field(default=False, description="Single Request")
        pass
 
-    class Valves(UserValves):
+    class Valves(BaseModel):
        request_limit: int = Field(default=5, description="Request Limit")
        pass
 
@@ -186,8 +192,10 @@ class Tools:
 
        if self.user_valves.single_request:
            request_limit = 1
+            print("Single Request Mode")
        else:
            request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")
 
        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
@@ -197,4 +205,4 @@ class Tools:
 
 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))
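
Reviewer note: below is a minimal, standalone sketch of the URL reconstruction that the reworked clean_link() performs, for anyone who wants to try the behaviour in isolation. It assumes parse_url() behaves like urllib.parse.urlparse (the same scheme/netloc/path/params/query/fragment fields the method reads); the names clean_link, base_scheme, and base_netloc are illustrative stand-ins for the method and its self.scheme / self.netloc state, not part of the patch.

    # Standalone sketch, assuming parse_url() is equivalent to urllib.parse.urlparse.
    from urllib.parse import urlparse

    def clean_link(link: str, base_scheme: str, base_netloc: str) -> str | None:
        parsed = urlparse(link)

        # Links that point at a different host are dropped, as in the patch.
        if parsed.netloc and parsed.netloc != base_netloc:
            return None

        # Relative links inherit the scheme and host of the page being scraped.
        scheme = parsed.scheme or base_scheme
        netloc = parsed.netloc or base_netloc

        # Re-attach each component with its separator only when it is present,
        # guarding against adding a second leading slash to an absolute path.
        path = parsed.path
        if path and not path.startswith("/"):
            path = "/" + path
        params = ";" + parsed.params if parsed.params else ""
        query = "?" + parsed.query if parsed.query else ""
        fragment = "#" + parsed.fragment if parsed.fragment else ""

        return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

    if __name__ == "__main__":
        # -> https://en.wikipedia.org/wiki/Shamisen#History
        print(clean_link("/wiki/Shamisen#History", "https", "en.wikipedia.org"))
        # Off-site link -> None
        print(clean_link("https://example.com/other", "https", "en.wikipedia.org"))

Reconstructing the URL component by component, each with its own separator, is what keeps links from being mangled; recording cleaned_url in visited and data (instead of the raw url) should also keep the visited check consistent with the keys stored in the scraped data.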