From 94166a7258231650c46defc9b6511f4c63d74324 Mon Sep 17 00:00:00 2001
From: Mark Bailey
Date: Sun, 26 Jan 2025 17:23:22 -0500
Subject: [PATCH] feat: v2.1.4

* fix: URLs being trashed
---
 scrape.py | 62 +++++++++++++++++++++++++++++++------------------------
 1 file changed, 35 insertions(+), 27 deletions(-)

diff --git a/scrape.py b/scrape.py
index 4b54a06..afe648b 100644
--- a/scrape.py
+++ b/scrape.py
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """
 
@@ -26,9 +26,6 @@ class Tools:
         data: dict = {}
         model_config = ConfigDict(arbitrary_types_allowed=True)
 
-        def set_req_limit(self, req_limit: int):
-            self.req_limit = req_limit
-
         def scrape_website(
             self,
             url: str,
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]
 
                # Mark URL as visited
-                self.visited.append(url)
-                self.data.update({url: data})
+                self.visited.append(cleaned_url)
+                self.data.update({cleaned_url: data})
                self.req_limit -= 1
 
                # Scrape all links in the page
@@ -85,37 +82,46 @@ class Tools:
                return self.data
 
-            except Exception:
+            except Exception as e:
+                print(f"Error when trying URL: {e}")
                return {}
 
        def clean_link(self, link) -> str | None:
            parsed_link = parse_url(link)
+            # separator reference: scheme :// netloc /path ;params ?query #fragment
+
            scheme = parsed_link.scheme
            netloc = parsed_link.netloc
            path = parsed_link.path
+            params = parsed_link.params
+            query = parsed_link.query
            fragment = parsed_link.fragment
 
-            if netloc is not None and netloc != "" and netloc != self.netloc:
-                return None
-
-            if path is not None and path != "":
-                if path.endswith("/"):
-                    path = path[:-1]
-                if not path.startswith("/"):
-                    path = "/" + path
-
-            if fragment is not None or fragment != "":
-                if parsed_link.fragment.endswith("/"):
-                    link = link[:-1]
-                if not fragment.startswith("/"):
-                    fragment = "/" + fragment
-
            if self.netloc is None or self.scheme is None:
                return None
 
-            link = self.netloc + path + fragment
-            link = self.scheme + "://" + link.replace("//", "/")
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None
 
-            return link
+            # clean each part of the URL and then reconstruct it
+            if scheme is None or scheme == "":
+                scheme = self.scheme
+
+            if netloc is None or netloc == "":
+                netloc = self.netloc
+
+            if path is not None and path != "":
+                path = path if path.startswith("/") else "/" + path
+
+            if params is not None and params != "":
+                params = ";" + params
+
+            if query is not None and query != "":
+                query = "?" + query
+
+            if fragment is not None and fragment != "":
+                fragment = "#" + fragment
+
+            return f"{scheme}://{netloc}{path}{params}{query}{fragment}"
 
        def extract_data(self, soup: BeautifulSoup) -> dict:
            # Extract data
@@ -164,7 +170,7 @@ class Tools:
        single_request: bool = Field(default=False, description="Single Request")
        pass
 
-    class Valves(UserValves):
+    class Valves(BaseModel):
        request_limit: int = Field(default=5, description="Request Limit")
        pass
 
@@ -186,8 +192,10 @@ class Tools:
 
        if self.user_valves.single_request:
            request_limit = 1
+            print("Single Request Mode")
        else:
            request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")
 
        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
@@ -197,4 +205,4 @@ class Tools:
 
 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))
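
Reviewer note: below is a minimal, standalone sketch of the URL reconstruction that the reworked clean_link() performs, for anyone who wants to try the behaviour in isolation. It assumes parse_url() behaves like urllib.parse.urlparse (the same scheme/netloc/path/params/query/fragment fields the method reads); the names clean_link, base_scheme, and base_netloc are illustrative stand-ins for the method and its self.scheme / self.netloc state, not part of the patch.

    # Standalone sketch, assuming parse_url() is equivalent to urllib.parse.urlparse.
    from urllib.parse import urlparse

    def clean_link(link: str, base_scheme: str, base_netloc: str) -> str | None:
        parsed = urlparse(link)

        # Links that point at a different host are dropped, as in the patch.
        if parsed.netloc and parsed.netloc != base_netloc:
            return None

        # Relative links inherit the scheme and host of the page being scraped.
        scheme = parsed.scheme or base_scheme
        netloc = parsed.netloc or base_netloc

        # Re-attach each component with its separator only when it is present,
        # guarding against adding a second leading slash to an absolute path.
        path = parsed.path
        if path and not path.startswith("/"):
            path = "/" + path
        params = ";" + parsed.params if parsed.params else ""
        query = "?" + parsed.query if parsed.query else ""
        fragment = "#" + parsed.fragment if parsed.fragment else ""

        return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

    if __name__ == "__main__":
        # -> https://en.wikipedia.org/wiki/Shamisen#History
        print(clean_link("/wiki/Shamisen#History", "https", "en.wikipedia.org"))
        # Off-site link -> None
        print(clean_link("https://example.com/other", "https", "en.wikipedia.org"))

Reconstructing the URL component by component, each with its own separator, is what keeps links from being mangled; recording cleaned_url in visited and data (instead of the raw url) should also keep the visited check consistent with the keys stored in the scraped data.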