feat: v2.1.4

* fix: URLs being trashed
Mark Bailey 2025-01-26 17:23:22 -05:00
parent 9c54bd2481
commit 94166a7258


@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """
@@ -26,9 +26,6 @@ class Tools:
         data: dict = {}
         model_config = ConfigDict(arbitrary_types_allowed=True)

-        def set_req_limit(self, req_limit: int):
-            self.req_limit = req_limit
-
         def scrape_website(
             self,
             url: str,
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]

             # Mark URL as visited
-            self.visited.append(url)
-            self.data.update({url: data})
+            self.visited.append(cleaned_url)
+            self.data.update({cleaned_url: data})
             self.req_limit -= 1

             # Scrape all links in the page
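
This visited-list change is the core of the dedupe fix: a raw link and its cleaned form can spell the same page two ways, and tracking the raw url let the scraper fetch it twice. A minimal sketch of the idea, with a toy normalize() standing in for clean_link:

visited: set[str] = set()

def normalize(link: str) -> str:
    # toy stand-in for clean_link: collapse trailing-slash variants
    return link.rstrip("/")

for link in ["https://example.com/a", "https://example.com/a/"]:
    cleaned = normalize(link)
    if cleaned in visited:
        continue  # already scraped under its canonical spelling
    visited.add(cleaned)
    print("scraping", cleaned)  # runs once, not twice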
@@ -85,37 +82,46 @@ class Tools:
                 return self.data
-            except Exception:
+            except Exception as e:
+                print(f"Error when trying URL: {e}")
                 return {}

         def clean_link(self, link) -> str | None:
             parsed_link = parse_url(link)
             """<scheme>://<netloc>/<path>;<params>?<query>#<fragment>"""
             scheme = parsed_link.scheme
             netloc = parsed_link.netloc
             path = parsed_link.path
+            params = parsed_link.params
+            query = parsed_link.query
             fragment = parsed_link.fragment

             if netloc is not None and netloc != "" and netloc != self.netloc:
                 return None
-            if path is not None and path != "":
-                if path.endswith("/"):
-                    path = path[:-1]
-                if not path.startswith("/"):
-                    path = "/" + path
-            if fragment is not None or fragment != "":
-                if parsed_link.fragment.endswith("/"):
-                    link = link[:-1]
-                if not fragment.startswith("/"):
-                    fragment = "/" + fragment
-            if self.netloc is None or self.scheme is None:
-                return None
-            link = self.netloc + path + fragment
-            link = self.scheme + "://" + link.replace("//", "/")
-            if netloc is not None and netloc != "" and netloc != self.netloc:
-                return None
-            return link
+
+            # clean each part of the URL and then reconstruct it
+            if scheme is None or scheme == "":
+                scheme = self.scheme
+            if netloc is None or netloc == "":
+                netloc = self.netloc
+            if path is not None and path != "":
+                path = "/" + path
+            if params is not None and params != "":
+                params = ";" + params
+            if query is not None and query != "":
+                query = "?" + query
+            if fragment is not None and fragment != "":
+                fragment = "#" + fragment
+
+            return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

         def extract_data(self, soup: BeautifulSoup) -> dict:
             # Extract data
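
For a sense of what the rewritten clean_link returns, here is a self-contained sketch of the same reconstruction using urllib.parse.urlparse (assuming the file's parse_url behaves like it; base_scheme and base_netloc are hypothetical stand-ins for the scraper's self.scheme and self.netloc, and the leading-slash handling is lightly hardened against double slashes):

from urllib.parse import urlparse

def clean_link_sketch(link: str, base_scheme: str = "https", base_netloc: str = "example.com") -> str | None:
    p = urlparse(link)
    # Links that point at a different host are rejected, as in the diff
    if p.netloc and p.netloc != base_netloc:
        return None
    # Fill in missing parts from the base, then re-prefix each optional part
    scheme = p.scheme or base_scheme
    netloc = p.netloc or base_netloc
    path = "/" + p.path.lstrip("/") if p.path else ""
    params = ";" + p.params if p.params else ""
    query = "?" + p.query if p.query else ""
    fragment = "#" + p.fragment if p.fragment else ""
    return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

print(clean_link_sketch("docs/page#top"))        # https://example.com/docs/page#top
print(clean_link_sketch("https://other.com/x"))  # None (different host)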
@@ -164,7 +170,7 @@ class Tools:
         single_request: bool = Field(default=False, description="Single Request")
         pass

-    class Valves(UserValves):
+    class Valves(BaseModel):
         request_limit: int = Field(default=5, description="Request Limit")
         pass
@@ -186,8 +192,10 @@ class Tools:
         if self.user_valves.single_request:
             request_limit = 1
+            print("Single Request Mode")
         else:
             request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")

         scraper = self.RecursiveScraper(request_limit)
         data = scraper.scrape_website(url)
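
The base-class change in the Valves hunk above matches the usual Open WebUI tool shape, where Valves (admin settings) and UserValves (per-user settings) are independent pydantic models rather than one inheriting the other; inheriting from UserValves would have pulled single_request into the admin valves too. A minimal sketch of the corrected shape and the request-limit selection it feeds (field names taken from the diff, wiring simplified):

from pydantic import BaseModel, Field

class UserValves(BaseModel):
    single_request: bool = Field(default=False, description="Single Request")

class Valves(BaseModel):
    request_limit: int = Field(default=5, description="Request Limit")

valves = Valves()
user_valves = UserValves(single_request=True)

# Same selection logic as the hunk above: single-request mode pins the limit to 1
request_limit = 1 if user_valves.single_request else valves.request_limit
print(request_limit)  # 1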
@@ -197,4 +205,4 @@ class Tools:
 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))