feat: v2.1.4
* fix: URLs being trashed
parent 9c54bd2481
commit 94166a7258
scrape.py (62 lines changed)
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.3
+version: 2.1.4
 licence: MIT
 """

@@ -26,9 +26,6 @@ class Tools:
         data: dict = {}
         model_config = ConfigDict(arbitrary_types_allowed=True)

-        def set_req_limit(self, req_limit: int):
-            self.req_limit = req_limit
-
         def scrape_website(
             self,
             url: str,
@@ -75,8 +72,8 @@ class Tools:
                     del data[key]

                # Mark URL as visited
-                self.visited.append(url)
-                self.data.update({url: data})
+                self.visited.append(cleaned_url)
+                self.data.update({cleaned_url: data})
                self.req_limit -= 1

                # Scrape all links in the page
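Note: keying both the visited list and the scraped data by the cleaned URL, rather than the raw link, is what stops the same page reached through slightly different spellings from being fetched and counted twice against the request budget. A minimal, hypothetical sketch of that bookkeeping (the names visited, data, and req_limit come from the diff; the class and methods around them are assumed, not taken from scrape.py):

# Hypothetical illustration of visited/req_limit bookkeeping keyed by the
# normalized URL; not the actual scrape_website body.
class VisitedTracker:
    def __init__(self, req_limit: int):
        self.visited: list[str] = []
        self.data: dict = {}
        self.req_limit = req_limit

    def should_fetch(self, cleaned_url: str) -> bool:
        # Skip pages already scraped and stop once the request budget is spent.
        return self.req_limit > 0 and cleaned_url not in self.visited

    def record(self, cleaned_url: str, page_data: dict) -> None:
        self.visited.append(cleaned_url)
        self.data[cleaned_url] = page_data
        self.req_limit -= 1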
@@ -85,37 +82,46 @@ class Tools:

                return self.data

-            except Exception:
+            except Exception as e:
+                print(f"Error when trying URL: {e}")
                return {}

        def clean_link(self, link) -> str | None:
            parsed_link = parse_url(link)
+            """<scheme>://<netloc>/<path>;<params>?<query>#<fragment>"""
+            scheme = parsed_link.scheme
            netloc = parsed_link.netloc
            path = parsed_link.path
+            params = parsed_link.params
+            query = parsed_link.query
            fragment = parsed_link.fragment

-            if netloc is not None and netloc != "" and netloc != self.netloc:
-                return None
-
-            if path is not None and path != "":
-                if path.endswith("/"):
-                    path = path[:-1]
-                if not path.startswith("/"):
-                    path = "/" + path
-
-            if fragment is not None or fragment != "":
-                if parsed_link.fragment.endswith("/"):
-                    link = link[:-1]
-                if not fragment.startswith("/"):
-                    fragment = "/" + fragment
-
            if self.netloc is None or self.scheme is None:
                return None

-            link = self.netloc + path + fragment
-            link = self.scheme + "://" + link.replace("//", "/")
+            if netloc is not None and netloc != "" and netloc != self.netloc:
+                return None

-            return link
+            # clean each part of the URL and then reconstruct it
+            if scheme is None or scheme == "":
+                scheme = self.scheme
+
+            if netloc is None or netloc == "":
+                netloc = self.netloc
+
+            if path is not None and path != "":
+                path = "/"+path
+
+            if params is not None and params != "":
+                params = ";" + params
+
+            if query is not None and query != "":
+                query = "?" + query
+
+            if fragment is not None and fragment != "":
+                fragment = "#" + fragment
+
+            return f"{scheme}://{netloc}{path}{params}{query}{fragment}"

        def extract_data(self, soup: BeautifulSoup) -> dict:
            # Extract data
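The rewritten clean_link is the heart of the "URLs being trashed" fix: instead of concatenating netloc + path + fragment and collapsing double slashes (which dropped query strings and params entirely), it re-attaches each present component with its own delimiter and reassembles the full <scheme>://<netloc><path>;<params>?<query>#<fragment> form. A standalone sketch of the same idea using the standard library's urllib.parse.urlparse; the project's parse_url helper and the base_scheme/base_netloc parameters standing in for self.scheme/self.netloc are assumptions made for illustration:

from urllib.parse import urlparse


def clean_link(link: str, base_scheme: str, base_netloc: str) -> str | None:
    # Illustrative normalization of a scraped link against the site being
    # crawled; not the code committed in scrape.py.
    parsed = urlparse(link)

    # Links pointing at a different host are skipped so the crawl stays on-site.
    if parsed.netloc and parsed.netloc != base_netloc:
        return None

    # Relative links fall back to the site's scheme and host.
    scheme = parsed.scheme or base_scheme
    netloc = parsed.netloc or base_netloc

    # Re-attach each component with its own delimiter only when it is present.
    path = parsed.path
    if path and not path.startswith("/"):
        path = "/" + path
    params = ";" + parsed.params if parsed.params else ""
    query = "?" + parsed.query if parsed.query else ""
    fragment = "#" + parsed.fragment if parsed.fragment else ""

    return f"{scheme}://{netloc}{path}{params}{query}{fragment}"


# Example: a relative link inherits the site's scheme and host and keeps its query:
# clean_link("/wiki/Shamisen?lang=en", "https", "en.wikipedia.org")
#   -> "https://en.wikipedia.org/wiki/Shamisen?lang=en"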
@@ -164,7 +170,7 @@ class Tools:
         single_request: bool = Field(default=False, description="Single Request")
         pass

-    class Valves(UserValves):
+    class Valves(BaseModel):
         request_limit: int = Field(default=5, description="Request Limit")
         pass

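Switching Valves to subclass BaseModel rather than UserValves separates the admin-level request_limit valve from the per-user single_request flag, which it previously inherited. A hedged sketch of how the two models presumably look after this change (field names and defaults are taken from the diff; the imports and surrounding structure are assumed):

from pydantic import BaseModel, Field


class UserValves(BaseModel):
    # Per-user toggle shown in the diff.
    single_request: bool = Field(default=False, description="Single Request")


class Valves(BaseModel):
    # Admin-level knob shown in the diff; no longer inherits from UserValves,
    # so it no longer carries single_request along with it.
    request_limit: int = Field(default=5, description="Request Limit")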
@@ -186,8 +192,10 @@ class Tools:

        if self.user_valves.single_request:
            request_limit = 1
+            print("Single Request Mode")
        else:
            request_limit = self.valves.request_limit
+            print(f"Request Limit Mode ({self.valves.request_limit})")

        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
@@ -197,4 +205,4 @@ class Tools:

 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))