feat: v2.1.2

* remove links from response to save space
2025-01-26 15:56:57 -05:00 · 2025-01-26 15:56:57 -05:00 · 08e99b5b09
commit 08e99b5b09
parent ea01c5ca35
1 changed file with 13 additions and 3 deletions
--- a/scrape.py
+++ b/scrape.py
@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.1
+version: 2.1.2
 licence: MIT
 """

@ -67,13 +67,20 @@ class Tools:
                soup = BeautifulSoup(response, "lxml")
                data = self.extract_data(soup)

+                links = data["links"]
+
+                keep = ["title", "headings", "paragraphs", "images"]
+                for key in list(data.keys()):
+                    if key not in keep:
+                        del data[key]
+
                # Mark URL as visited
                self.visited.append(url)
                self.data.update({url: data})
                self.req_limit -= 1

                # Scrape all links in the page
-                for link in data["links"]:
+                for link in links:
                    self.scrape_website(link)

                return self.data
@ -186,6 +193,9 @@ class Tools:
        scraper = self.RecursiveScraper(request_limit)
        data = scraper.scrape_website(url)
        json_s = json.dumps(data)
-        scraper = None

        return json_s
+
+if __name__ == "__main__":  # manual run: scrape one sample page and print the JSON result
+    sample_url = "https://stackoverflow.com/questions/6486450/compute-list-difference"
+    print(Tools().scrape_recursively(sample_url))