feat: v2.1.3
* make text finding recursive
This commit is contained in:
parent
08e99b5b09
commit
9c54bd2481
15
scrape.py
15
scrape.py
@ -5,7 +5,7 @@ author_url: https://git.markbailey.dev/cerbervs
|
|||||||
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
git_url: https://git.markbailey.dev/cerbervs/scrape.git
|
||||||
description: Scrapes web with option for recursive scraping.
|
description: Scrapes web with option for recursive scraping.
|
||||||
requirements: websocket, requests, bs4, pydantic
|
requirements: websocket, requests, bs4, pydantic
|
||||||
version: 2.1.2
|
version: 2.1.3
|
||||||
licence: MIT
|
licence: MIT
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@ -128,21 +128,20 @@ class Tools:
|
|||||||
}
|
}
|
||||||
|
|
||||||
# Find all headings (h1 to h6)
|
# Find all headings (h1 to h6)
|
||||||
heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
|
headings = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])
|
||||||
headings = soup.find_all(heading_tag)
|
|
||||||
if headings:
|
if headings:
|
||||||
data["headings"].extend([tag.get_text() for tag in headings])
|
data["headings"].extend([tag.get_text() for tag in headings])
|
||||||
|
|
||||||
# Find all paragraphs
|
# Find all paragraphs
|
||||||
paragraphs = soup.find_all("p")
|
paragraphs = soup.find_all("p", recursive=True)
|
||||||
if paragraphs:
|
if paragraphs:
|
||||||
data["paragraphs"] = [p.get_text() for p in paragraphs]
|
data["paragraphs"] = [p.get_text() for p in paragraphs]
|
||||||
|
|
||||||
divs = soup.find_all("div")
|
divs = soup.find_all("div", recursive=True)
|
||||||
if divs:
|
if divs:
|
||||||
data["divs"] = [div.get_text() for div in divs]
|
data["divs"] = [div.get_text() for div in divs]
|
||||||
|
|
||||||
lis = soup.find_all("li")
|
lis = soup.find_all("li", recursive=True)
|
||||||
if lis:
|
if lis:
|
||||||
data["lis"] = [li.get_text() for li in lis]
|
data["lis"] = [li.get_text() for li in lis]
|
||||||
|
|
||||||
@ -185,7 +184,7 @@ class Tools:
|
|||||||
:params url: The URL of the web page to be scraped.
|
:params url: The URL of the web page to be scraped.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if self.user_valves.single_request or self.valves.single_request:
|
if self.user_valves.single_request:
|
||||||
request_limit = 1
|
request_limit = 1
|
||||||
else:
|
else:
|
||||||
request_limit = self.valves.request_limit
|
request_limit = self.valves.request_limit
|
||||||
@ -198,4 +197,4 @@ class Tools:
|
|||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
tools = Tools()
|
tools = Tools()
|
||||||
print(tools.scrape_recursively("https://stackoverflow.com/questions/6486450/compute-list-difference"))
|
print(tools.scrape_recursively("https://en.wikipedia.org/wiki/Shamisen"))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user