Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 17 additions & 2 deletions scrapegraphai/docloaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,27 @@
"""
This module handles document loading functionalities for the ScrapeGraphAI application.

Note: ChromiumLoader and PlasmateLoader are lazy-imported to avoid triggering
torchcodec/FFmpeg DLL loading at import time (sentence_transformers -> torchcodec chain).
"""

from .browser_base import browser_base_fetch
from .chromium import ChromiumLoader
from .plasmate import PlasmateLoader
from .scrape_do import scrape_do_fetch

# Public names that are resolved on first access instead of at package import
# time, mapped to the relative module that defines each of them.
_LAZY_MODULES = {
    "ChromiumLoader": ".chromium",
    "PlasmateLoader": ".plasmate",
}


def __getattr__(name):
    """Resolve a lazily-imported public name on first attribute access.

    This is the module-level attribute hook described in PEP 562; Python only
    calls it when normal attribute lookup on the module fails.

    Args:
        name: Attribute being looked up on this module.

    Returns:
        The attribute called ``name`` taken from the module registered for it
        in ``_LAZY_MODULES``.

    Raises:
        AttributeError: If ``name`` is not registered in ``_LAZY_MODULES``.
    """
    if name not in _LAZY_MODULES:
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

    # Imported here so the hook does not depend on the file's import block.
    import importlib

    module = importlib.import_module(_LAZY_MODULES[name], __package__)
    value = getattr(module, name)
    # Store the resolved object on this module so later lookups succeed through
    # normal attribute access and never re-enter this hook.
    globals()[name] = value
    return value


__all__ = [
"browser_base_fetch",
"ChromiumLoader",
Expand Down
11 changes: 9 additions & 2 deletions scrapegraphai/docloaders/chromium.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,14 @@

import aiohttp
import async_timeout
from langchain_community.document_loaders.base import BaseLoader
from langchain_core.documents import Document

from ..utils import Proxy, dynamic_import, get_logger, parse_or_search_proxy

logger = get_logger("web-loader")


class ChromiumLoader(BaseLoader):
class ChromiumLoader:
"""Scrapes HTML pages from URLs using a (headless) instance of the
Chromium web driver with proxy protection.

Expand Down Expand Up @@ -436,6 +435,14 @@ async def ascrape_with_js_support(
finally:
await browser.close()

def load(self) -> List[Document]:
    """Eagerly collect everything ``lazy_load`` yields.

    Returns:
        List[Document]: The items produced by ``self.lazy_load()``, in
        iteration order.
    """
    documents = []
    for document in self.lazy_load():
        documents.append(document)
    return documents

async def aload(self) -> List[Document]:
    """Eagerly collect everything ``alazy_load`` yields.

    Returns:
        List[Document]: The items produced by iterating
        ``self.alazy_load()`` asynchronously, in iteration order.
    """
    documents = []
    async for document in self.alazy_load():
        documents.append(document)
    return documents

def lazy_load(self) -> Iterator[Document]:
"""
Lazily load text content from the provided URLs.
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import concurrent.futures

import requests
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_openai import AzureChatOpenAI, ChatOpenAI

Expand Down Expand Up @@ -182,6 +181,7 @@ def load_file_content(self, source, input_type):
"""

if input_type == "pdf":
from langchain_community.document_loaders import PyPDFLoader
loader = PyPDFLoader(source)
# PyPDFLoader.load() can be blocking for large PDFs. Run it in a thread and
# enforce the configured timeout if provided.
Expand Down
2 changes: 1 addition & 1 deletion scrapegraphai/nodes/robots_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import AsyncChromiumLoader

from ..helpers import robots_dictionary
from ..prompts import TEMPLATE_ROBOT
Expand Down Expand Up @@ -90,6 +89,7 @@ def execute(self, state: dict) -> dict:
else:
parsed_url = urlparse(source)
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
from langchain_community.document_loaders import AsyncChromiumLoader
loader = AsyncChromiumLoader(f"{base_url}/robots.txt")
document = loader.load()
if "ollama" in self.llm_model.model:
Expand Down
15 changes: 15 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,24 @@
from typing import Any, Dict
from unittest.mock import Mock

import sys
import types

import pytest
from dotenv import load_dotenv

# Install a stub ``torchcodec`` module to prevent FFmpeg DLL crashes at import
# time: sentence_transformers -> torchcodec -> FFmpeg native DLLs can't load on
# some systems.
from importlib.machinery import ModuleSpec

_tc = types.ModuleType("torchcodec")
_tc.__version__ = "0.0.0"
_tc.__file__ = "<mock>"
# Use a real ModuleSpec rather than a bare module object so that code which
# inspects the spec finds the standard attributes (``origin``, ``parent``, ...).
# ``is_package=True`` keeps ``submodule_search_locations`` as an empty list.
_tc.__spec__ = ModuleSpec("torchcodec", loader=None, is_package=True)
# Never replace a torchcodec module that has already been imported.
sys.modules.setdefault("torchcodec", _tc)

# Load environment variables
load_dotenv()

Expand Down
Loading