feat: v2.1.2
* remove links from response to save space
This commit is contained in:
parent
ea01c5ca35
commit
08e99b5b09
16
scrape.py
16
scrape.py
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
|
|||||||
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
||||||
description: Scrapes web with option for recursive scraping.
|
description: Scrapes web with option for recursive scraping.
|
||||||
requirements: websocket, requests, bs4, pydantic
|
requirements: websocket, requests, bs4, pydantic
|
||||||
version: 2.1.1
|
version: 2.1.2
|
||||||
licence: MIT
|
licence: MIT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@@ -67,13 +67,20 @@ class Tools:
|
|||||||
soup = BeautifulSoup(response, "lxml")
|
soup = BeautifulSoup(response, "lxml")
|
||||||
data = self.extract_data(soup)
|
data = self.extract_data(soup)
|
||||||
|
|
||||||
|
links = data["links"]
|
||||||
|
|
||||||
|
keep = ["title", "headings", "paragraphs", "images"]
|
||||||
|
for key in list(data.keys()):
|
||||||
|
if key not in keep:
|
||||||
|
del data[key]
|
||||||
|
|
||||||
# Mark URL as visited
|
# Mark URL as visited
|
||||||
self.visited.append(url)
|
self.visited.append(url)
|
||||||
self.data.update({url: data})
|
self.data.update({url: data})
|
||||||
self.req_limit -= 1
|
self.req_limit -= 1
|
||||||
|
|
||||||
# Scrape all links in the page
|
# Scrape all links in the page
|
||||||
for link in data["links"]:
|
for link in links:
|
||||||
self.scrape_website(link)
|
self.scrape_website(link)
|
||||||
|
|
||||||
return self.data
|
return self.data
|
||||||
@@ -186,6 +193,9 @@ class Tools:
|
|||||||
scraper = self.RecursiveScraper(request_limit)
|
scraper = self.RecursiveScraper(request_limit)
|
||||||
data = scraper.scrape_website(url)
|
data = scraper.scrape_website(url)
|
||||||
json_s = json.dumps(data)
|
json_s = json.dumps(data)
|
||||||
scraper = None
|
|
||||||
|
|
||||||
return json_s
|
return json_s
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
tools = Tools()
|
||||||
|
print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user