feat: v2.1.2

* remove links from response to save space
This commit is contained in:
Mark Bailey 2025-01-26 15:56:57 -05:00
parent ea01c5ca35
commit 08e99b5b09

View File

@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
git_url: https://git.markbailey.dev/cerbervs/scrape.git
description: Scrapes web with option for recursive scraping.
requirements: websocket, requests, bs4, pydantic
version: 2.1.1
version: 2.1.2
licence: MIT
"""
@@ -67,13 +67,20 @@ class Tools:
soup = BeautifulSoup(response, "lxml")
data = self.extract_data(soup)
links = data["links"]
keep = ["title", "headings", "paragraphs", "images"]
for key in list(data.keys()):
if key not in keep:
del data[key]
# Mark URL as visited
self.visited.append(url)
self.data.update({url: data})
self.req_limit -= 1
# Scrape all links in the page
for link in data["links"]:
for link in links:
self.scrape_website(link)
return self.data
@@ -186,6 +193,9 @@ class Tools:
scraper = self.RecursiveScraper(request_limit)
data = scraper.scrape_website(url)
json_s = json.dumps(data)
scraper = None
return json_s
if __name__ == "__main__":
tools = Tools()
print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))