Mark Bailey 2025-01-26 13:25:58 -05:00
parent 8c03c0d4bc
commit 248340badb


@@ -1,20 +1,24 @@
 """
 title: Scrape Recursively
 author: Mark Bailey
-author_url: <url>
+author_url: https://git.markbailey.dev
 git_url: https://git.markbailey.dev/cerbervs/scrape.git
 description: Scrapes a website recursively using requests and BeautifulSoup.
-version: 0.1.0
+version: 1.0.0
 licence: MIT
 """
 
+import json
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict
 
 
-class Tools(BaseModel):
-    citation: bool = True
-    visited: list = []
-    netloc: str | None = None
+class Tools:
+    req_limit: int = 100
+
+    class RecursiveScraper(BaseModel):
+        citation: bool = True
+        visited: list = []
+        netloc: str | None = None
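Worth noting about this restructure: on a pydantic BaseModel, a mutable default like visited: list = [] is copied per instance, so moving the crawl state into the nested RecursiveScraper keeps separate runs independent. A minimal sketch of that behavior (the class name here is an illustrative stand-in, not the committed code):

    from pydantic import BaseModel

    class Scraper(BaseModel):
        # Illustrative stand-in for the nested RecursiveScraper above.
        visited: list = []

    a, b = Scraper(), Scraper()
    a.visited.append("https://example.com")
    print(a.visited)  # ['https://example.com']
    print(b.visited)  # [] -- pydantic deep-copies mutable defaults per instance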
@@ -32,11 +36,6 @@ class Tools(BaseModel):
         self,
         url: str,
     ) -> dict:
-        """
-        Scrapes data from a web page using requests and BeautifulSoup.
-
-        :params url: The URL of the web page to be scraped. Required.
-        """
         if self.netloc is None:
             self.netloc = parse_url(url).netloc
             self.scheme = parse_url(url).scheme
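For reference, the netloc/scheme capture on the first request relies on standard urllib.parse behavior; a quick check (the URL is just an example):

    from urllib.parse import urlparse as parse_url

    parts = parse_url("https://docs.openwebui.com/getting-started")
    print(parts.netloc)  # docs.openwebui.com
    print(parts.scheme)  # https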
@@ -74,8 +73,7 @@ class Tools(BaseModel):
             return self.data
-        except Exception as e:
-            print(e)
+        except Exception:
             return {}
 
     def clean_link(self, link) -> str | None:
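The clean_link body is not part of this diff. As a rough idea of what such a helper typically does (this sketch is hypothetical, not the committed implementation), it would resolve relative hrefs and discard off-site links, mirroring the netloc check above:

    from urllib.parse import urljoin, urlparse

    def clean_link(base: str, link: str) -> str | None:
        # Resolve relative hrefs against the page URL.
        absolute = urljoin(base, link)
        # Keep only same-site links (hypothetical, matching the stored netloc).
        if urlparse(absolute).netloc != urlparse(base).netloc:
            return None
        # Drop fragments so anchors on a visited page aren't re-crawled.
        return absolute.split("#")[0]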
@@ -148,7 +146,32 @@ class Tools(BaseModel):
         return data
 
+    def __init__(self, req_limit: int = 10):
+        """
+        Initializes the Tools class.
+
+        :params req_limit: The number of requests to be made to scrape the website.
+        """
+        self.citation: bool = True
+        self.req_limit = req_limit
+        self.tool = self.RecursiveScraper(req_limit=req_limit)
+
+    def scrape_recursively(self, url: str) -> str:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+
+        :params url: The URL of the web page to be scraped.
+        """
+        data = self.tool.scrape_website(url)
+        self.tool.visited = []
+        self.tool.data = {}
+        self.tool.netloc = None
+        self.tool.scheme = None
+        return json.dumps(data)
+
 
 if __name__ == "__main__":
-    url = "https://docs.openwebui.com"
-    print(Tools(req_limit=10).scrape_website(url))
+    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
+    print(Tools(req_limit=10).scrape_recursively(url))
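Since scrape_recursively now returns a JSON string rather than a dict, a caller has to parse it back. A minimal usage sketch, assuming the file is importable as a module named scrape (the module name is an assumption):

    import json
    # from scrape import Tools  # import path assumed

    tools = Tools(req_limit=10)
    raw = tools.scrape_recursively("https://docs.openwebui.com")
    data = json.loads(raw)  # back to the dict the scraper built
    print(f"scraped {len(data)} entries")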