feat: v2.1.4
* fix: URLs being trashed
parent 9c54bd2481
commit 94166a7258
scrape.py (62 lines changed)
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """

@@ -26,9 +26,6 @@ class Tools:
         data: dict = {}
         model_config = ConfigDict(arbitrary_types_allowed=True)

-        def set_req_limit(self, req_limit: int):
-            self.req_limit = req_limit
-
         def scrape_website(
             self,
             url: str,
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]

                # Mark URL as visited
-                self.visited.append(url)
-                self.data.update({url: data})
+                self.visited.append(cleaned_url)
+                self.data.update({cleaned_url: data})
                self.req_limit -= 1

                # Scrape all links in the page
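Note: keying both the visited list and the scraped data by the cleaned URL, rather than the raw link, is what stops the same page reached through slightly different spellings from being fetched and counted twice against the request budget. A minimal, hypothetical sketch of that bookkeeping (the names visited, data, and req_limit come from the diff; the class and methods around them are assumed, not taken from scrape.py):

# Hypothetical illustration of visited/req_limit bookkeeping keyed by the
# normalized URL; not the actual scrape_website body.
class VisitedTracker:
    def __init__(self, req_limit: int):
        self.visited: list[str] = []
        self.data: dict = {}
        self.req_limit = req_limit

    def should_fetch(self, cleaned_url: str) -> bool:
        # Skip pages already scraped and stop once the request budget is spent.
        return self.req_limit > 0 and cleaned_url not in self.visited

    def record(self, cleaned_url: str, page_data: dict) -> None:
        self.visited.append(cleaned_url)
        self.data[cleaned_url] = page_data
        self.req_limit -= 1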
@@ -85,37 +82,46 @@ class Tools:

                return self.data

-            except Exception:
+            except Exception as e:
+                print(f"Error when trying URL: {e}")
                return {}

        def clean_link(self, link) -> str | None:
            parsed_link = parse_url(link)
+            """<scheme>://<netloc>/<path>;<params>?<query>#<fragment>"""
+            scheme = parsed_link.scheme
            netloc = parsed_link.netloc
            path = parsed_link.path
+            params = parsed_link.params
+            query = parsed_link.query
            fragment = parsed_link.fragment

-            if netloc is not None and netloc != "" and netloc != self.netloc:
-                return None
-
-            if path is not None and path != "":
-                if path.endswith("/"):
-                    path = path[:-1]
-                if not path.startswith("/"):
-                    path = "/" + path
-
-            if fragment is not None or fragment != "":
-                if parsed_link.fragment.endswith("/"):
-                    link = link[:-1]
-                if not fragment.startswith("/"):
-                    fragment = "/" + fragment
-
            if self.netloc is None or self.scheme is None:
                return None

-            link = self.netloc + path + fragment
-            link = self.scheme + "://" + link.replace("//", "/")
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None

-            return link
+            # clean each part of the URL and then reconstruct it
+            if scheme is None or scheme == "":
+                scheme = self.scheme
+
+            if netloc is None or netloc == "":
+                netloc = self.netloc
+
+            if path is not None and path != "":
+                path = "/"+path
+
+            if params is not None and params != "":
+                params = ";" + params
+
+            if query is not None and query != "":
+                query = "?" + query
+
+            if fragment is not None and fragment != "":
+                fragment = "#" + fragment
+
+            return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

        def extract_data(self, soup: BeautifulSoup) -> dict:
            # Extract data
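The rewritten clean_link is the heart of the "URLs being trashed" fix: instead of concatenating netloc + path + fragment and collapsing double slashes (which dropped query strings and params entirely), it re-attaches each present component with its own delimiter and reassembles the full <scheme>://<netloc><path>;<params>?<query>#<fragment> form. A standalone sketch of the same idea using the standard library's urllib.parse.urlparse; the project's parse_url helper and the base_scheme/base_netloc parameters standing in for self.scheme/self.netloc are assumptions made for illustration:

from urllib.parse import urlparse


def clean_link(link: str, base_scheme: str, base_netloc: str) -> str | None:
    # Illustrative normalization of a scraped link against the site being
    # crawled; not the code committed in scrape.py.
    parsed = urlparse(link)

    # Links pointing at a different host are skipped so the crawl stays on-site.
    if parsed.netloc and parsed.netloc != base_netloc:
        return None

    # Relative links fall back to the site's scheme and host.
    scheme = parsed.scheme or base_scheme
    netloc = parsed.netloc or base_netloc

    # Re-attach each component with its own delimiter only when it is present.
    path = parsed.path
    if path and not path.startswith("/"):
        path = "/" + path
    params = ";" + parsed.params if parsed.params else ""
    query = "?" + parsed.query if parsed.query else ""
    fragment = "#" + parsed.fragment if parsed.fragment else ""

    return f"{scheme}://{netloc}{path}{params}{query}{fragment}"


# Example: a relative link inherits the site's scheme and host and keeps its query:
# clean_link("/wiki/Shamisen?lang=en", "https", "en.wikipedia.org")
#   -> "https://en.wikipedia.org/wiki/Shamisen?lang=en"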
@@ -164,7 +170,7 @@ class Tools:
         single_request: bool = Field(default=False, description="Single Request")
         pass

-    class Valves(UserValves):
+    class Valves(BaseModel):
         request_limit: int = Field(default=5, description="Request Limit")
         pass

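Switching Valves to subclass BaseModel rather than UserValves separates the admin-level request_limit valve from the per-user single_request flag, which it previously inherited. A hedged sketch of how the two models presumably look after this change (field names and defaults are taken from the diff; the imports and surrounding structure are assumed):

from pydantic import BaseModel, Field


class UserValves(BaseModel):
    # Per-user toggle shown in the diff.
    single_request: bool = Field(default=False, description="Single Request")


class Valves(BaseModel):
    # Admin-level knob shown in the diff; no longer inherits from UserValves,
    # so it no longer carries single_request along with it.
    request_limit: int = Field(default=5, description="Request Limit")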
@@ -186,8 +192,10 @@ class Tools:

        if self.user_valves.single_request:
            request_limit = 1
+            print("Single Request Mode")
        else:
            request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")

        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
@@ -197,4 +205,4 @@ class Tools:

 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))