* Fix the `__signature__` error caused by Tools being a plain class only (it now subclasses pydantic's BaseModel)
This commit is contained in:
Mark Bailey 2025-01-26 10:54:02 -05:00
parent 251adec7bc
commit 8c03c0d4bc

View File

@ -1,18 +1,32 @@
"""
title: Scrape Recursively
author: Mark Bailey
author_url: <url>
git_url: https://git.markbailey.dev/cerbervs/scrape.git
description: Scrapes a website recursively using requests and BeautifulSoup.
version: 0.1.0
licence: MIT
"""
import requests import requests
from urllib.parse import urlparse as parse_url from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pydantic import BaseModel, ConfigDict
class Tools: class Tools(BaseModel):
def __init__(self): citation: bool = True
"""Initialize the Tool.""" visited: list = []
self.citation = True netloc: str | None = None
self.visited = [] scheme: str | None = None
self.req_limit = 5 data: dict = {}
self.netloc = None req_limit: int = 5
self.scheme = None
self.visited = [] model_config = ConfigDict(arbitrary_types_allowed=True)
self.data = {}
def __init__(self, req_limit: int = 5):
super().__init__()
self.req_limit = req_limit
def scrape_website( def scrape_website(
self, self,
@ -31,34 +45,38 @@ class Tools:
return self.data return self.data
try: try:
# Send GET request # Clean the URL
cleaned_url = self.clean_link(url) cleaned_url = self.clean_link(url)
if cleaned_url is None: if cleaned_url is None:
raise Exception("Invalid URL") raise Exception("Invalid URL: " + url)
if cleaned_url in self.visited:
return {}
# Send GET request
response = requests.get(cleaned_url) response = requests.get(cleaned_url)
if not response.ok: if not response.ok:
raise Exception("Failed to fetch URL") self.visited.append(cleaned_url)
raise Exception("Failed to fetch URL: " + cleaned_url)
# Parse HTML content using BeautifulSoup and lxml parser # Parse HTML content using BeautifulSoup and lxml parser
soup = BeautifulSoup(response.text, "lxml") soup = BeautifulSoup(response.text, "lxml")
data = self.extract_data(soup) data = self.extract_data(soup)
self.req_limit -= 1 # Mark URL as visited
self.visited.append(url) self.visited.append(url)
self.data.update({url: data}) self.data.update({url: data})
self.req_limit -= 1
# Scrape all links in the page
for link in data["links"]: for link in data["links"]:
if link in self.visited:
continue
self.scrape_website(link) self.scrape_website(link)
return self.data return self.data
except Exception as e: except Exception as e:
raise e print(e)
return {}
def clean_link(self, link) -> str | None: def clean_link(self, link) -> str | None:
parsed_link = parse_url(link) parsed_link = parse_url(link)
@ -132,5 +150,5 @@ class Tools:
if __name__ == "__main__": if __name__ == "__main__":
url = "https://docs.openwebui.com/features/plugin/" url = "https://docs.openwebui.com"
print(Tools().scrape_website(url)) print(Tools(req_limit=10).scrape_website(url))