import requests
from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup


class Tools:
    def __init__(self):
        """Initialize the Tool."""
        self.citation = True
        self.visited = []
        self.req_limit = 5
        self.netloc = None
        self.scheme = None
        self.data = {}

    def scrape_website(
        self,
        url: str,
    ) -> dict:
        """
        Scrapes data from a web page using requests and BeautifulSoup.

        :param url: The URL of the web page to be scraped. Required.
        """
        # Remember the scheme and host of the first URL so that later
        # requests stay on the same site.
        if self.netloc is None:
            self.netloc = parse_url(url).netloc
            self.scheme = parse_url(url).scheme

        # Stop recursing once the request budget is exhausted.
        if self.req_limit == 0:
            return self.data

        try:
            # Send GET request (with a timeout so a slow page cannot hang the crawl)
            cleaned_url = self.clean_link(url)
            if cleaned_url is None:
                raise Exception("Invalid URL")
            response = requests.get(cleaned_url, timeout=10)
            if not response.ok:
                raise Exception("Failed to fetch URL")

            # Parse HTML content using BeautifulSoup and the lxml parser
            soup = BeautifulSoup(response.text, "lxml")
            data = self.extract_data(soup)

            self.req_limit -= 1
            self.visited.append(url)
            self.data[url] = data

            # Recurse into every link found on the page that has not been visited yet
            for link in data["links"]:
                if link in self.visited:
                    continue
                self.scrape_website(link)

            return self.data
        except Exception:
            # Propagate any network or parsing error to the caller unchanged
            raise

    def clean_link(self, link: str) -> str | None:
        """Normalize a (possibly relative) link into an absolute same-site URL,
        or return None if it points to another host or the base is unknown."""
        parsed_link = parse_url(link)
        netloc = parsed_link.netloc
        path = parsed_link.path
        fragment = parsed_link.fragment

        # Reject links that point to a different host.
        if netloc and netloc != self.netloc:
            return None

        if path:
            if path.endswith("/"):
                path = path[:-1]
            if not path.startswith("/"):
                path = "/" + path

        if fragment:
            if fragment.endswith("/"):
                fragment = fragment[:-1]
            if not fragment.startswith("/"):
                fragment = "/" + fragment

        if self.netloc is None or self.scheme is None:
            return None

        link = self.netloc + path + fragment
        return self.scheme + "://" + link.replace("//", "/")

    def extract_data(self, soup: BeautifulSoup) -> dict:
        """Extract the title, headings, paragraphs, divs, list items, links
        and image sources from a parsed page."""
        data = {
            "title": soup.title.string if soup.title else None,
            "headings": [],
            "paragraphs": [],
            "links": [],
            "images": [],
        }

        # Find all headings (h1 to h6)
        heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
        headings = soup.find_all(heading_tags)
        if headings:
            data["headings"].extend([tag.get_text() for tag in headings])

        # Find all paragraphs
        paragraphs = soup.find_all("p")
        if paragraphs:
            data["paragraphs"] = [p.get_text() for p in paragraphs]

        # Find all divs and list items (keys are added only when present)
        divs = soup.find_all("div")
        if divs:
            data["divs"] = [div.get_text() for div in divs]
        lis = soup.find_all("li")
        if lis:
            data["lis"] = [li.get_text() for li in lis]

        # Extract all links
        links = soup.find_all("a", href=True)
        if links:
            data["links"] = [link["href"] for link in links if link["href"]]

        # Extract image sources
        images = soup.find_all("img", src=True)
        if images:
            data["images"] = [img["src"] for img in images if img["src"]]

        return data


if __name__ == "__main__":
    url = "https://docs.openwebui.com/features/plugin/"
    print(Tools().scrape_website(url))
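
    # A minimal offline sketch of the dict that extract_data() returns, using a
    # hypothetical in-memory HTML sample rather than a live page; the built-in
    # "html.parser" backend is used here so this example does not require lxml.
    sample_html = (
        "<html><head><title>Demo</title></head>"
        "<body><h1>Heading</h1><p>Text.</p><a href='/docs'>Docs</a></body></html>"
    )
    print(Tools().extract_data(BeautifulSoup(sample_html, "html.parser")))
    # Expected shape: {'title': 'Demo', 'headings': ['Heading'],
    #                  'paragraphs': ['Text.'], 'links': ['/docs'], 'images': []}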