From ff1401761d73186074e04f28379b837662e848a7 Mon Sep 17 00:00:00 2001
From: Mark Bailey
Date: Sun, 26 Jan 2025 13:51:18 -0500
Subject: [PATCH] feat: 1.0.0

---
 scrape.py | 41 ++++++++++++++++++++---------------------
 1 file changed, 20 insertions(+), 21 deletions(-)

diff --git a/scrape.py b/scrape.py
index 4431fc8..26ab657 100644
--- a/scrape.py
+++ b/scrape.py
@@ -1,22 +1,14 @@
-"""
-title: Scrape Recursively
-author: Mark Bailey
-author_url: https://git.markbailey.dev
-git_url: https://git.markbailey.dev/cerbervs/scrape.git
-description: Scrapes a website recursively using requests and BeautifulSoup.
-version: 1.0.0
-licence: MIT
-"""
-
 import json
+import websocket
 import requests
 from urllib.parse import urlparse as parse_url
 from bs4 import BeautifulSoup
 from pydantic import BaseModel, ConfigDict
 
+DEFAULT_REQUEST_LIMIT = 50
 
 class Tools:
-    req_limit: int = 100
+    req_limit: int = DEFAULT_REQUEST_LIMIT
 
     class RecursiveScraper(BaseModel):
         citation: bool = True
@@ -24,11 +16,11 @@ class Tools:
         netloc: str | None = None
         scheme: str | None = None
         data: dict = {}
-        req_limit: int = 5
+        req_limit: int = DEFAULT_REQUEST_LIMIT
 
         model_config = ConfigDict(arbitrary_types_allowed=True)
 
-        def __init__(self, req_limit: int = 5):
+        def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
             super().__init__()
             self.req_limit = req_limit
 
@@ -52,14 +44,22 @@ class Tools:
             if cleaned_url in self.visited:
                 return {}
 
-            # Send GET request
-            response = requests.get(cleaned_url)
-            if not response.ok:
+            # Try to send GET request using WebSocket
+            try:
+                ws = websocket.create_connection(f"ws://{self.netloc}", timeout=1)
+                ws.send(json.dumps({"url": cleaned_url}))
+                response = ws.recv()
+                ws.close()
+            except Exception:
+                # Fall back to requests library if WebSocket fails
+                response = requests.get(cleaned_url).text
+
+            if not response:
                 self.visited.append(cleaned_url)
                 raise Exception("Failed to fetch URL: " + cleaned_url)
 
             # Parse HTML content using BeautifulSoup and lxml parser
-            soup = BeautifulSoup(response.text, "lxml")
+            soup = BeautifulSoup(response, "lxml")
             data = self.extract_data(soup)
 
             # Mark URL as visited
@@ -146,7 +146,7 @@ class Tools:
 
             return data
 
-    def __init__(self, req_limit: int = 10):
+    def __init__(self, req_limit: int = DEFAULT_REQUEST_LIMIT):
         """
         Initializes the Tools class.
         :params req_limit: The number of requests to be made to scrape the website.
@@ -171,7 +171,6 @@ class Tools:
 
         return json.dumps(data)
 
-
 if __name__ == "__main__":
-    url = "https://pkg.go.dev/github.com/go-chi/chi/v5"
-    print(Tools(req_limit=10).scrape_recursively(url))
+    tools = Tools()
+    print(tools.scrape_recursively("https://pkg.go.dev/github.com/go-chi/chi/v5"))
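
Note: below is a minimal standalone sketch of the fetch-with-fallback behaviour this patch introduces, assuming the websocket-client and requests packages are installed; the fetch_page helper name and the pkg.go.dev netloc in the usage line are illustrative only and are not part of the patched module.

    import json
    import requests
    import websocket

    def fetch_page(cleaned_url: str, netloc: str) -> str:
        # Hypothetical helper mirroring the patched scrape path:
        # try a WebSocket round-trip first, then fall back to plain HTTP GET.
        try:
            ws = websocket.create_connection(f"ws://{netloc}", timeout=1)
            ws.send(json.dumps({"url": cleaned_url}))
            body = ws.recv()
            ws.close()
        except Exception:
            # Most hosts will not accept a bare WebSocket connection,
            # so the requests fallback is the common path.
            body = requests.get(cleaned_url).text
        if not body:
            raise Exception("Failed to fetch URL: " + cleaned_url)
        return body

    print(fetch_page("https://pkg.go.dev/github.com/go-chi/chi/v5", "pkg.go.dev"))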