feat: v2.1.4
* fix: URLs being trashed
parent 9c54bd2481
commit 94166a7258

scrape.py (62 changed lines)
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """

@@ -26,9 +26,6 @@ class Tools:
     data: dict = {}
     model_config = ConfigDict(arbitrary_types_allowed=True)

-    def set_req_limit(self, req_limit: int):
-        self.req_limit = req_limit
-
     def scrape_website(
         self,
         url: str,
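With set_req_limit() removed, the request limit appears to be fixed when the scraper object is constructed, as the later self.RecursiveScraper(request_limit) call suggests. A minimal sketch of that pattern, assuming a pydantic model like the one above; the class and field names here are illustrative stand-ins, not the repository's code:

from pydantic import BaseModel, ConfigDict

class RecursiveScraper(BaseModel):
    # Illustrative stand-in: the limit is set once at construction time
    # instead of being mutated later through a setter.
    model_config = ConfigDict(arbitrary_types_allowed=True)
    req_limit: int = 5
    visited: list[str] = []
    data: dict = {}

scraper = RecursiveScraper(req_limit=3)  # e.g. allow three requests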
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]

             # Mark URL as visited
-            self.visited.append(url)
-            self.data.update({url: data})
+            self.visited.append(cleaned_url)
+            self.data.update({cleaned_url: data})
             self.req_limit -= 1

             # Scrape all links in the page
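This hunk keys both the visited list and the results dict on cleaned_url rather than the raw url, so a page reached through differently spelled links is recorded only once. A minimal sketch of the idea; the record helper is hypothetical, only the visited/data/cleaned_url names mirror the diff:

visited: list[str] = []
data: dict[str, dict] = {}

def record(cleaned_url: str, page_data: dict) -> None:
    # Deduplicate on the normalized URL so "page" and "page/" cleaned to the
    # same string do not produce separate entries.
    if cleaned_url in visited:
        return
    visited.append(cleaned_url)
    data[cleaned_url] = page_data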
@@ -85,37 +82,46 @@ class Tools:

             return self.data

-        except Exception:
+        except Exception as e:
+            print(f"Error when trying URL: {e}")
             return {}

     def clean_link(self, link) -> str | None:
         parsed_link = parse_url(link)
         """<scheme>://<netloc>/<path>;<params>?<query>#<fragment>"""
         scheme = parsed_link.scheme
         netloc = parsed_link.netloc
         path = parsed_link.path
         params = parsed_link.params
         query = parsed_link.query
         fragment = parsed_link.fragment

-        if netloc is not None and netloc != "" and netloc != self.netloc:
-            return None
-
-        if path is not None and path != "":
-            if path.endswith("/"):
-                path = path[:-1]
-            if not path.startswith("/"):
-                path = "/" + path
-
-        if fragment is not None or fragment != "":
-            if parsed_link.fragment.endswith("/"):
-                link = link[:-1]
-            if not fragment.startswith("/"):
-                fragment = "/" + fragment
-
-        if self.netloc is None or self.scheme is None:
-            return None
-
-        link = self.netloc + path + fragment
-        link = self.scheme + "://" + link.replace("//", "/")
+        if netloc is not None and netloc != "" and netloc != self.netloc:
+            return None

-        return link
+        # clean each part of the URL and then reconstruct it
+        if scheme is None or scheme == "":
+            scheme = self.scheme
+
+        if netloc is None or netloc == "":
+            netloc = self.netloc
+
+        if path is not None and path != "":
+            path = "/"+path
+
+        if params is not None and params != "":
+            params = ";" + params
+
+        if query is not None and query != "":
+            query = "?" + query
+
+        if fragment is not None and fragment != "":
+            fragment = "#" + fragment
+
+        return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

     def extract_data(self, soup: BeautifulSoup) -> dict:
         # Extract data
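The rewritten clean_link() splits the link into scheme, netloc, path, params, query and fragment, rejects foreign hosts, fills missing parts from the site being scraped, and reattaches each piece with its own separator instead of concatenating netloc + path + fragment. A minimal sketch of the same idea using only the standard library; clean_link_sketch, BASE_SCHEME and BASE_NETLOC are illustrative stand-ins, not the repository's parse_url helper or Tools attributes:

from urllib.parse import urlparse

BASE_SCHEME = "https"             # stands in for self.scheme
BASE_NETLOC = "en.wikipedia.org"  # stands in for self.netloc

def clean_link_sketch(link: str) -> str | None:
    p = urlparse(link)  # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>

    # Reject links that point at a different host.
    if p.netloc and p.netloc != BASE_NETLOC:
        return None

    # Fill missing parts from the base site, then rebuild each component
    # with the separator it needs.
    scheme = p.scheme or BASE_SCHEME
    netloc = p.netloc or BASE_NETLOC
    path = p.path if (not p.path or p.path.startswith("/")) else "/" + p.path
    params = ";" + p.params if p.params else ""
    query = "?" + p.query if p.query else ""
    fragment = "#" + p.fragment if p.fragment else ""
    return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

# A relative link now survives intact instead of being trashed:
# clean_link_sketch("/wiki/Shamisen?action=history")
#   -> "https://en.wikipedia.org/wiki/Shamisen?action=history"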
@@ -164,7 +170,7 @@ class Tools:
         single_request: bool = Field(default=False, description="Single Request")
         pass

-    class Valves(UserValves):
+    class Valves(BaseModel):
         request_limit: int = Field(default=5, description="Request Limit")
         pass

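Valves now derives from pydantic's BaseModel directly instead of UserValves, keeping the admin-side request_limit an ordinary validated model. A small sketch of how such a valve model behaves; the field mirrors the diff, everything else is illustrative:

from pydantic import BaseModel, Field

class Valves(BaseModel):
    request_limit: int = Field(default=5, description="Request Limit")

print(Valves().request_limit)                 # 5, the default
print(Valves(request_limit=1).request_limit)  # 1, overridden and validated as int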
@@ -186,8 +192,10 @@ class Tools:

         if self.user_valves.single_request:
             request_limit = 1
+            print("Single Request Mode")
         else:
             request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")

         scraper = self.RecursiveScraper(request_limit)
         data = scraper.scrape_website(url)
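The added prints make the chosen mode visible: one request when the per-user single_request valve is on, otherwise the admin-configured limit. The selection reduces to a small helper like this sketch; the function name is hypothetical, only the valve fields mirror the diff:

def pick_request_limit(single_request: bool, configured_limit: int) -> int:
    # The per-user valve wins: a single request regardless of the admin limit.
    if single_request:
        print("Single Request Mode")
        return 1
    print(f"Request Limit Mode ({configured_limit})")
    return configured_limit

# pick_request_limit(True, 5)  -> 1
# pick_request_limit(False, 5) -> 5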
@@ -197,4 +205,4 @@ class Tools:

 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))