feat: v2.1.2
* remove links from response to save space
This commit is contained in:
parent
ea01c5ca35
commit
08e99b5b09
16
scrape.py
16
scrape.py
@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
|
||||
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
||||
description: Scrapes web pages, with an option for recursive scraping.
|
||||
requirements: websocket, requests, bs4, pydantic
|
||||
version: 2.1.1
|
||||
version: 2.1.2
|
||||
licence: MIT
|
||||
"""
|
||||
|
||||
@ -67,13 +67,20 @@ class Tools:
|
||||
soup = BeautifulSoup(response, "lxml")
|
||||
data = self.extract_data(soup)
|
||||
|
||||
links = data["links"]
|
||||
|
||||
keep = ["title", "headings", "paragraphs", "images"]
|
||||
for key in list(data.keys()):
|
||||
if key not in keep:
|
||||
del data[key]
|
||||
|
||||
# Mark URL as visited
|
||||
self.visited.append(url)
|
||||
self.data.update({url: data})
|
||||
self.req_limit -= 1
|
||||
|
||||
# Scrape all links in the page
|
||||
for link in data["links"]:
|
||||
for link in links:
|
||||
self.scrape_website(link)
|
||||
|
||||
return self.data
|
||||
@ -186,6 +193,9 @@ class Tools:
|
||||
scraper = self.RecursiveScraper(request_limit)
|
||||
data = scraper.scrape_website(url)
|
||||
json_s = json.dumps(data)
|
||||
scraper = None
|
||||
|
||||
return json_s
|
||||
|
||||
if __name__ == "__main__":
|
||||
tools = Tools()
|
||||
print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))
|
||||
|
Loading…
x
Reference in New Issue
Block a user