From 9c54bd2481c7cecf07b04a2f90a33389eb93c7b5 Mon Sep 17 00:00:00 2001
From: Mark Bailey
Date: Sun, 26 Jan 2025 16:37:36 -0500
Subject: [PATCH] feat: v2.1.3

* make text finding recursive

---
 scrape.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/scrape.py b/scrape.py
index e882a7a..4b54a06 100644
--- a/scrape.py
+++ b/scrape.py
@@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes web with option for recursive scraping.
 requirements: websocket, requests, bs4, pydantic
-version: 2.1.2
+version: 2.1.3
 licence: MIT
 """

@@ -128,21 +128,20 @@ class Tools:
         }

         # Find all headings (h1 to h6)
-        heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
-        headings = soup.find_all(heading_tag)
+        headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
         if headings:
             data["headings"].extend([tag.get_text() for tag in headings])

         # Find all paragraphs
-        paragraphs = soup.find_all("p")
+        paragraphs = soup.find_all("p", recursive=True)
         if paragraphs:
             data["paragraphs"] = [p.get_text() for p in paragraphs]

-        divs = soup.find_all("div")
+        divs = soup.find_all("div", recursive=True)
         if divs:
             data["divs"] = [div.get_text() for div in divs]

-        lis = soup.find_all("li")
+        lis = soup.find_all("li", recursive=True)
         if lis:
             data["lis"] = [li.get_text() for li in lis]

@@ -185,7 +184,7 @@ class Tools:
         :params url: The URL of the web page to be scraped.
         """

-        if self.user_valves.single_request or self.valves.single_request:
+        if self.user_valves.single_request:
             request_limit = 1
         else:
             request_limit = self.valves.request_limit

@@ -198,4 +197,4 @@ class Tools:

 if __name__ == "__main__":
     tools = Tools()
-    print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))
+    print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
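
A minimal sketch (not part of the patch) of what the `recursive` flag in BeautifulSoup's find_all does, using a hypothetical HTML snippet: recursive=True (the default) searches all descendants, while recursive=False only looks at direct children, so the explicit flag above keeps the deep search while making the intent visible.

    # Illustrative only; the HTML below is made up for this example.
    from bs4 import BeautifulSoup

    html = "<div><section><p>nested paragraph</p></section></div>"
    soup = BeautifulSoup(html, "html.parser")

    deep = soup.find_all("p", recursive=True)           # all descendants (default behavior)
    shallow = soup.div.find_all("p", recursive=False)   # direct children of <div> only

    print([p.get_text() for p in deep])     # ['nested paragraph']
    print([p.get_text() for p in shallow])  # [] -- the <p> is nested inside <section>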