commit 251adec7bce28cf64466e2a97c82dcf148400c7d
Author: Mark Bailey
Date:   Sat Jan 25 17:11:13 2025 -0500

    feat: initial commit

diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..5ba4671
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,138 @@
+import requests
+from urllib.parse import urlparse as parse_url
+from bs4 import BeautifulSoup
+
+
+class Tools:
+    def __init__(self):
+        """Initialize the Tool."""
+        self.citation = True
+        self.visited = []
+        self.req_limit = 5
+        self.netloc = None
+        self.scheme = None
+        self.data = {}
+
+    def scrape_website(
+        self,
+        url: str,
+    ) -> dict:
+        """
+        Scrapes data from a web page using requests and BeautifulSoup.
+        :param url: The URL of the web page to be scraped. Required.
+        """
+
+        # Remember the host and scheme of the first URL so recursion stays on-site
+        if self.netloc is None:
+            self.netloc = parse_url(url).netloc
+            self.scheme = parse_url(url).scheme
+
+        if self.req_limit == 0:
+            return self.data
+
+        try:
+            # Send GET request
+            cleaned_url = self.clean_link(url)
+            if cleaned_url is None:
+                raise Exception("Invalid URL")
+
+            response = requests.get(cleaned_url)
+            if not response.ok:
+                raise Exception("Failed to fetch URL")
+
+            # Parse HTML content using BeautifulSoup and lxml parser
+            soup = BeautifulSoup(response.text, "lxml")
+            data = self.extract_data(soup)
+
+            self.req_limit -= 1
+            self.visited.append(url)
+
+            self.data.update({url: data})
+
+            for link in data["links"]:
+                if link in self.visited:
+                    continue
+
+                self.scrape_website(link)
+
+            return self.data
+
+        except Exception as e:
+            raise e
+
+    def clean_link(self, link) -> str | None:
+        parsed_link = parse_url(link)
+        netloc = parsed_link.netloc
+        path = parsed_link.path
+        fragment = parsed_link.fragment
+
+        # Skip links that point to a different host
+        if netloc is not None and netloc != "" and netloc != self.netloc:
+            return None
+
+        if path is not None and path != "":
+            if path.endswith("/"):
+                path = path[:-1]
+            if not path.startswith("/"):
+                path = "/" + path
+
+        if fragment is not None and fragment != "":
+            if fragment.endswith("/"):
+                fragment = fragment[:-1]
+            if not fragment.startswith("/"):
+                fragment = "/" + fragment
+
+        if self.netloc is None or self.scheme is None:
+            return None
+
+        # Rebuild an absolute URL on the original scheme and host
+        link = self.netloc + path + fragment
+        link = self.scheme + "://" + link.replace("//", "/")
+
+        return link
+
+    def extract_data(self, soup: BeautifulSoup) -> dict:
+        # Extract data
+        data = {
+            "title": soup.title.string if soup.title else None,
+            "headings": [],
+            "paragraphs": [],
+            "links": [],
+            "images": [],
+        }
+
+        # Find all headings (h1 to h6)
+        heading_tag = ["h1", "h2", "h3", "h4", "h5", "h6"]
+        headings = soup.find_all(heading_tag)
+        if headings:
+            data["headings"].extend([tag.get_text() for tag in headings])
+
+        # Find all paragraphs
+        paragraphs = soup.find_all("p")
+        if paragraphs:
+            data["paragraphs"] = [p.get_text() for p in paragraphs]
+
+        divs = soup.find_all("div")
+        if divs:
+            data["divs"] = [div.get_text() for div in divs]
+
+        lis = soup.find_all("li")
+        if lis:
+            data["lis"] = [li.get_text() for li in lis]
+
+        # Extract all links
+        links = soup.find_all("a", href=True)
+        if links:
+            data["links"] = [link["href"] for link in links if link["href"]]
+
+        # Extract image sources
+        images = soup.find_all("img", src=True)
+        if images:
+            data["images"] = [img["src"] for img in images if img["src"]]
+
+        return data
+
+
+if __name__ == "__main__":
+    url = "https://docs.openwebui.com/features/plugin/"
+    print(Tools().scrape_website(url))
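
For context, a minimal sketch of how this tool might be exercised as a plain Python module outside of Open WebUI. It assumes the requests, beautifulsoup4, and lxml packages are installed and that scrape.py is importable from the working directory; the starting URL is only an example, and the result shape mirrors what extract_data() produces per page.

    # usage_sketch.py -- illustrative only; assumes scrape.py sits alongside this file
    from scrape import Tools

    tools = Tools()
    # scrape_website() follows same-host links, fetching at most req_limit pages
    results = tools.scrape_website("https://docs.openwebui.com/features/plugin/")

    # results maps each visited URL to the fields collected by extract_data()
    for page_url, page in results.items():
        print(page_url)
        print("  title:", page["title"])
        print("  headings found:", len(page["headings"]))
        print("  links found:", len(page["links"]))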