diff --git a/scrape.py b/scrape.py index af0606a..e882a7a 100644 --- a/scrape.py +++ b/scrape.py @@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs git_url: https://git.markbailey.dev/cerbervs/scrape.git description: Scrapes web with option for recursive scraping. requirements: websocket, requests, bs4, pydantic -version: 2.1.1 +version: 2.1.2 licence: MIT """ @@ -67,13 +67,20 @@ class Tools: soup = BeautifulSoup(response, "lxml") data = self.extract_data(soup) + links = data["links"] + + keep = ["title", "headings", "paragraphs", "images"] + for key in list(data.keys()): + if key not in keep: + del data[key] + # Mark URL as visited self.visited.append(url) self.data.update({url: data}) self.req_limit -= 1 # Scrape all links in the page - for link in data["links"]: + for link in links: self.scrape_website(link) return self.data @@ -186,6 +193,9 @@ class Tools: scraper = self.RecursiveScraper(request_limit) data = scraper.scrape_website(url) json_s = json.dumps(data) - scraper = None return json_s + +if __name__ == "__main__": + tools = Tools() + print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))