import requests
from urllib.parse import urlparse as parse_url
from bs4 import BeautifulSoup


class Tools:
    def __init__(self):
        """Initialize the Tool."""
        self.citation = True
        self.visited = []
        self.req_limit = 5
        self.netloc = None
        self.scheme = None
        self.data = {}

    def scrape_website(
        self,
        url: str,
    ) -> dict:
        """
        Scrapes data from a web page using requests and BeautifulSoup.

        :param url: The URL of the web page to be scraped. Required.
        """
        # Remember the scheme and host of the first URL so that later
        # requests stay on the same site.
        if self.netloc is None:
            self.netloc = parse_url(url).netloc
            self.scheme = parse_url(url).scheme

        # Stop recursing once the request budget is exhausted.
        if self.req_limit == 0:
            return self.data

        try:
            # Send GET request (with a timeout so a slow page cannot hang the crawl)
            cleaned_url = self.clean_link(url)
            if cleaned_url is None:
                raise Exception("Invalid URL")
            response = requests.get(cleaned_url, timeout=10)
            if not response.ok:
                raise Exception("Failed to fetch URL")

            # Parse HTML content using BeautifulSoup and the lxml parser
            soup = BeautifulSoup(response.text, "lxml")
            data = self.extract_data(soup)

            self.req_limit -= 1
            self.visited.append(url)
            self.data[url] = data

            # Recurse into every link found on the page that has not been visited yet
            for link in data["links"]:
                if link in self.visited:
                    continue
                self.scrape_website(link)

            return self.data
        except Exception:
            # Propagate any network or parsing error to the caller unchanged
            raise

    def clean_link(self, link: str) -> str | None:
        """Normalize a (possibly relative) link into an absolute same-site URL,
        or return None if it points to another host or the base is unknown."""
        parsed_link = parse_url(link)
        netloc = parsed_link.netloc
        path = parsed_link.path
        fragment = parsed_link.fragment

        # Reject links that point to a different host.
        if netloc and netloc != self.netloc:
            return None

        if path:
            if path.endswith("/"):
                path = path[:-1]
            if not path.startswith("/"):
                path = "/" + path

        if fragment:
            if fragment.endswith("/"):
                fragment = fragment[:-1]
            if not fragment.startswith("/"):
                fragment = "/" + fragment

        if self.netloc is None or self.scheme is None:
            return None

        link = self.netloc + path + fragment
        return self.scheme + "://" + link.replace("//", "/")

    def extract_data(self, soup: BeautifulSoup) -> dict:
        """Extract the title, headings, paragraphs, divs, list items, links
        and image sources from a parsed page."""
        data = {
            "title": soup.title.string if soup.title else None,
            "headings": [],
            "paragraphs": [],
            "links": [],
            "images": [],
        }

        # Find all headings (h1 to h6)
        heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
        headings = soup.find_all(heading_tags)
        if headings:
            data["headings"].extend([tag.get_text() for tag in headings])

        # Find all paragraphs
        paragraphs = soup.find_all("p")
        if paragraphs:
            data["paragraphs"] = [p.get_text() for p in paragraphs]

        # Find all divs and list items (keys are added only when present)
        divs = soup.find_all("div")
        if divs:
            data["divs"] = [div.get_text() for div in divs]
        lis = soup.find_all("li")
        if lis:
            data["lis"] = [li.get_text() for li in lis]

        # Extract all links
        links = soup.find_all("a", href=True)
        if links:
            data["links"] = [link["href"] for link in links if link["href"]]

        # Extract image sources
        images = soup.find_all("img", src=True)
        if images:
            data["images"] = [img["src"] for img in images if img["src"]]

        return data


if __name__ == "__main__":
    url = "https://docs.openwebui.com/features/plugin/"
    print(Tools().scrape_website(url))
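
    # A minimal offline sketch of the dict that extract_data() returns, using a
    # hypothetical in-memory HTML sample rather than a live page; the built-in
    # "html.parser" backend is used here so this example does not require lxml.
    sample_html = (
        "<html><head><title>Demo</title></head>"
        "<body><h1>Heading</h1><p>Text.</p><a href='/docs'>Docs</a></body></html>"
    )
    print(Tools().extract_data(BeautifulSoup(sample_html, "html.parser")))
    # Expected shape: {'title': 'Demo', 'headings': ['Heading'],
    #                  'paragraphs': ['Text.'], 'links': ['/docs'], 'images': []}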