* error about __signature__ for Tool being class only
This commit is contained in:
Mark Bailey 2025-01-26 10:54:02 -05:00
parent 251adec7bc
commit 8c03c0d4bc

View File

@ -1,18 +1,32 @@
"""
title: Scrape Recursively
author: Mark Bailey
author_url: <url>
git_url: https://git.markbailey.dev/cerbervs/scrape.git
description: Scrapes a website recursively using requests and BeautifulSoup.
version: 0.1.0
licence: MIT
"""
import requests
from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup
from pydantic import BaseModel, ConfigDict
class Tools:
    """Recursive website scraper state holder.

    Attributes are plain instance fields so that every instance starts with
    its own fresh containers.
    """

    def __init__(self):
        """Initialize the Tool with an empty crawl state."""
        # Flag read by the host application; kept enabled as before.
        self.citation = True
        # Remaining number of requests this instance is allowed to make.
        self.req_limit = 5
        # Scheme and network location of the crawl; unset until known.
        self.netloc = None
        self.scheme = None
        # URLs already processed (initialised once; the earlier duplicate
        # assignment was a redundant dead store).
        self.visited = []
        # Scraped results collected so far.
        self.data = {}
class Tools(BaseModel):
citation: bool = True
visited: list = []
netloc: str | None = None
scheme: str | None = None
data: dict = {}
req_limit: int = 5
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(self, req_limit: int = 5):
    """Create the tool with an initial request budget.

    Args:
        req_limit: Starting value of the ``req_limit`` field, which
            ``scrape_website`` decrements after each page it parses.

    Raises:
        pydantic.ValidationError: If ``req_limit`` cannot be validated
            against the declared ``int`` field.
    """
    # Pass the value through the BaseModel constructor so it is validated
    # against the declared field; assigning it after construction skips
    # validation unless ``validate_assignment`` is enabled in the config.
    super().__init__(req_limit=req_limit)
def scrape_website(
self,
@ -31,34 +45,38 @@ class Tools:
return self.data
try:
# Send GET request
# Clean the URL
cleaned_url = self.clean_link(url)
if cleaned_url is None:
raise Exception("Invalid URL")
raise Exception("Invalid URL: " + url)
if cleaned_url in self.visited:
return {}
# Send GET request
response = requests.get(cleaned_url)
if not response.ok:
raise Exception("Failed to fetch URL")
self.visited.append(cleaned_url)
raise Exception("Failed to fetch URL: " + cleaned_url)
# Parse HTML content using BeautifulSoup and lxml parser
soup = BeautifulSoup(response.text, "lxml")
data = self.extract_data(soup)
self.req_limit -= 1
# Mark URL as visited
self.visited.append(url)
self.data.update({url: data})
self.req_limit -= 1
# Scrape all links in the page
for link in data["links"]:
if link in self.visited:
continue
self.scrape_website(link)
return self.data
except Exception as e:
raise e
print(e)
return {}
def clean_link(self, link) -> str | None:
parsed_link = parse_url(link)
@ -132,5 +150,5 @@ class Tools:
# Script entry point: run the scraper against a starting URL and print
# whatever scrape_website returns. As shown here, two invocations are
# present: one with the default request limit on the plugin docs page, and
# one with req_limit=10 on the documentation root.
# NOTE(review): these look like the before/after lines of a single change;
# confirm whether both calls are meant to remain.
if __name__ == "__main__":
url = "https://docs.openwebui.com/features/plugin/"
print(Tools().scrape_website(url))
url = "https://docs.openwebui.com"
print(Tools(req_limit=10).scrape_website(url))