Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
0acab2b
Added bridge class for llms to fix retrieval issue
pk-zipstack Feb 12, 2026
5fc3ef8
Fixed import error with sub-question retrieval
pk-zipstack Feb 12, 2026
dc57c53
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 12, 2026
0055521
[FEAT] Rewrite LLMCompat to emulate llama-index interface without dep…
hari-kuriakose Feb 19, 2026
0be9fb1
Merge branch 'main' into fix/retriever-llm-bridge-class
pk-zipstack Feb 19, 2026
922a8a1
Remove investigation notes file from branch
pk-zipstack Mar 3, 2026
4a6c392
Update prompt-service/src/unstract/prompt_service/core/retrievers/bas…
pk-zipstack Mar 3, 2026
92dba83
Address PR review comments: simplify LLM conversion in retrievers
pk-zipstack Mar 3, 2026
a1040c1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
e3e91fe
Merge branch 'main' into fix/retriever-llm-bridge-class
pk-zipstack Mar 3, 2026
2b3c3ff
Merge branch 'main' into fix/retriever-llm-bridge-class
pk-zipstack Mar 12, 2026
306e3e7
Add LLMCompat.from_llm() factory and unit tests for retriever LLM
pk-zipstack Mar 13, 2026
97f5d2c
Commit uv.lock changes
pk-zipstack Mar 13, 2026
c6fe158
Move SDK1 tests to sdk1/tests and keep only RetrieverLLM tests in pro…
pk-zipstack Mar 13, 2026
cbec777
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 13, 2026
211baf6
Address code review feedback from greptile
pk-zipstack Mar 13, 2026
3273408
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 13, 2026
12f78e1
Fix test mock for system_prompt and move litellm.drop_params to modul…
pk-zipstack Mar 13, 2026
2ac1613
Make RetrieverLLM construction lazy to avoid redundant init
pk-zipstack Mar 13, 2026
0b9c5e5
Merge branch 'main' into fix/retriever-llm-bridge-class
pk-zipstack Mar 13, 2026
d55f8e4
Reuse existing LLM instance in LLMCompat.from_llm() instead of re-cre…
pk-zipstack Mar 18, 2026
ebeb1a3
Delegate LLMCompat calls to LLM instead of calling litellm directly
pk-zipstack Mar 18, 2026
f8f88c0
Add tests for LLMCompat delegation and BaseRetriever.llm lazy property
pk-zipstack Mar 18, 2026
787eb0d
Use drop litellm params at the module level
pk-zipstack Mar 18, 2026
4d68900
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2026
3b5b6f0
Update LLMCompat and _messages_to_prompt docstrings
pk-zipstack Mar 18, 2026
17aabaf
Preserve system messages in _messages_to_prompt and add pytest-asynci…
pk-zipstack Mar 18, 2026
b77c707
Commit uv.lock changes
pk-zipstack Mar 18, 2026
aa2e5ee
Merge branch 'main' into fix/retriever-llm-bridge-class
pk-zipstack Mar 18, 2026
08564f1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 18, 2026
b4f63d5
Add require_llm() guard for retrievers that need an LLM
pk-zipstack Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions backend/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions platform-service/uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions prompt-service/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,10 @@ unstract-sdk1 = { path = "../unstract/sdk1", editable = true }
[dependency-groups]
test = [
"pytest~=8.0.1",
"pytest-asyncio>=0.23.0",
"pytest-dotenv==0.5.2",
"pytest-mock~=3.14.0",
"pytest-md-report>=0.6.2",
"python-dotenv==1.0.1",
"flask-WTF~=1.1",
]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from unstract.prompt_service.core.retrievers.retriever_llm import RetrieverLLM
from unstract.sdk1.llm import LLM
from unstract.sdk1.vector_db import VectorDB

Expand All @@ -23,7 +24,37 @@ def __init__(
self.prompt = prompt
self.doc_id = doc_id
self.top_k = top_k
self.llm = llm if llm else None
self._llm: LLM | None = llm
self._retriever_llm: RetrieverLLM | None = None

@property
def llm(self) -> RetrieverLLM | None:
    """Lazily build and cache a llama-index compatible LLM.

    ``RetrieverLLM`` construction (adapter init, CallbackManager setup)
    is deferred until first access, so retrievers that never touch the
    LLM (Simple, Automerging, Recursive) pay nothing for it. Returns
    ``None`` when the retriever was constructed without an SDK1 LLM.
    """
    base = self._llm
    if base is None:
        return None
    cached = self._retriever_llm
    if cached is None:
        cached = RetrieverLLM(llm=base)
        self._retriever_llm = cached
    return cached

def require_llm(self) -> RetrieverLLM:
    """Return the llama-index LLM, failing fast when none is configured.

    Retrievers that depend on an LLM (KeywordTable, Fusion, Subquestion)
    call this instead of reading ``self.llm`` directly, so a missing LLM
    surfaces as a clear error rather than llama-index silently falling
    back to its default OpenAI LLM.

    Raises:
        ValueError: If the retriever was constructed without an LLM.
    """
    resolved = self.llm
    if resolved is not None:
        return resolved
    raise ValueError(
        f"{type(self).__name__} requires an LLM. "
        "Pass llm= when constructing the retriever."
    )

@staticmethod
def retrieve() -> set[str]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def retrieve(self) -> set[str]:
set[str]: A set of text chunks retrieved from the database.
"""
try:
llm = self.require_llm()
logger.info(
f"Retrieving chunks for {self.doc_id} using LlamaIndex QueryFusionRetriever."
)
Expand Down Expand Up @@ -64,7 +65,7 @@ def retrieve(self) -> set[str]:
mode="simple", # Use simple fusion mode (reciprocal rank fusion)
use_async=False,
verbose=True,
llm=self.llm, # LLM generates query variations
llm=llm,
)

# Retrieve nodes using fusion technique
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def retrieve(self) -> set[str]:
set[str]: A set of text chunks retrieved from the database.
"""
try:
llm = self.require_llm()
logger.info(
f"Retrieving chunks for {self.doc_id} using LlamaIndex KeywordTableIndex."
)
Expand Down Expand Up @@ -48,7 +49,7 @@ def retrieve(self) -> set[str]:
keyword_index = KeywordTableIndex(
nodes=[node.node for node in all_nodes],
show_progress=True,
llm=self.llm, # Use the provided LLM instead of defaulting to OpenAI
llm=llm,
)

# Create retriever from keyword index
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
from collections.abc import Sequence
from typing import Any

from llama_index.core.base.llms.types import (
ChatMessage,
ChatResponse,
ChatResponseAsyncGen,
ChatResponseGen,
CompletionResponse,
CompletionResponseAsyncGen,
CompletionResponseGen,
LLMMetadata,
MessageRole,
)
from llama_index.core.llms.llm import LLM as LlamaIndexBaseLLM # noqa: N811
from pydantic import PrivateAttr

from unstract.sdk1.llm import LLM, LLMCompat


class RetrieverLLM(LlamaIndexBaseLLM):
    """Adapt SDK1's ``LLMCompat`` to llama-index's ``LLM`` interface.

    llama-index's ``resolve_llm()`` asserts ``isinstance(llm, LLM)`` where
    ``LLM`` is ``llama_index.core.llms.llm.LLM``. SDK1's ``LLMCompat`` is a
    plain class with no llama-index ancestry, so it fails that check.

    This adapter subclasses the llama-index ``LLM`` base (satisfying the
    isinstance check) and forwards every call to an internal ``LLMCompat``
    instance. Streaming variants are intentionally unsupported.
    """

    # Wrapped SDK1 compat layer; PrivateAttr keeps it out of pydantic fields.
    _compat: LLMCompat = PrivateAttr()

    def __init__(self, llm: LLM, **kwargs: Any) -> None:  # noqa: ANN401
        """Wrap the given SDK1 ``LLM`` instance."""
        # Pydantic's __init__ must run before private attributes are assigned.
        super().__init__(**kwargs)
        self._compat = LLMCompat.from_llm(llm)

    @property
    def metadata(self) -> LLMMetadata:
        """Describe the wrapped model as a chat-capable LLM."""
        return LLMMetadata(
            is_chat_model=True,
            model_name=self._compat.get_model_name(),
        )

    @staticmethod
    def _to_chat_response(result: Any) -> ChatResponse:  # noqa: ANN401
        """Convert an ``LLMCompat`` chat result into a ``ChatResponse``."""
        assistant_msg = ChatMessage(
            role=MessageRole.ASSISTANT,
            content=result.message.content,
        )
        return ChatResponse(message=assistant_msg, raw=result.raw)

    # ── Sync ─────────────────────────────────────────────────────────────────

    def chat(
        self,
        messages: Sequence[ChatMessage],
        **kwargs: Any,  # noqa: ANN401
    ) -> ChatResponse:
        """Run a synchronous chat via the wrapped ``LLMCompat``."""
        return self._to_chat_response(self._compat.chat(messages, **kwargs))

    def complete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any,  # noqa: ANN401
    ) -> CompletionResponse:
        """Run a synchronous completion via the wrapped ``LLMCompat``."""
        result = self._compat.complete(prompt, formatted=formatted, **kwargs)
        return CompletionResponse(text=result.text, raw=result.raw)

    def stream_chat(
        self,
        messages: Sequence[ChatMessage],
        **kwargs: Any,  # noqa: ANN401
    ) -> ChatResponseGen:
        """Streaming chat is not offered by this adapter."""
        raise NotImplementedError("stream_chat is not supported.")

    def stream_complete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any,  # noqa: ANN401
    ) -> CompletionResponseGen:
        """Streaming completion is not offered by this adapter."""
        raise NotImplementedError("stream_complete is not supported.")

    # ── Async ────────────────────────────────────────────────────────────────

    async def achat(
        self,
        messages: Sequence[ChatMessage],
        **kwargs: Any,  # noqa: ANN401
    ) -> ChatResponse:
        """Run an asynchronous chat via the wrapped ``LLMCompat``."""
        result = await self._compat.achat(messages, **kwargs)
        return self._to_chat_response(result)

    async def acomplete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any,  # noqa: ANN401
    ) -> CompletionResponse:
        """Run an asynchronous completion via the wrapped ``LLMCompat``."""
        result = await self._compat.acomplete(prompt, formatted=formatted, **kwargs)
        return CompletionResponse(text=result.text, raw=result.raw)

    async def astream_chat(
        self,
        messages: Sequence[ChatMessage],
        **kwargs: Any,  # noqa: ANN401
    ) -> ChatResponseAsyncGen:
        """Async streaming chat is not offered by this adapter."""
        raise NotImplementedError("astream_chat is not supported.")

    async def astream_complete(
        self,
        prompt: str,
        formatted: bool = False,
        **kwargs: Any,  # noqa: ANN401
    ) -> CompletionResponseAsyncGen:
        """Async streaming completion is not offered by this adapter."""
        raise NotImplementedError("astream_complete is not supported.")
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging

from llama_index.core.query_engine import SubQuestionQueryEngine
from llama_index.core.question_gen.llm_generators import LLMQuestionGenerator
from llama_index.core.schema import QueryBundle
from llama_index.core.tools import QueryEngineTool, ToolMetadata

Expand All @@ -22,9 +23,10 @@ def retrieve(self) -> set[str]:
set[str]: A set of text chunks retrieved from the database.
"""
try:
llm = self.require_llm()
logger.info("Initialising vector query engine...")
vector_query_engine = self.vector_db.get_vector_store_index().as_query_engine(
llm=self.llm, similarity_top_k=self.top_k
llm=llm, similarity_top_k=self.top_k
)
logger.info(
f"Retrieving chunks for {self.doc_id} using SubQuestionQueryEngine."
Expand All @@ -39,10 +41,14 @@ def retrieve(self) -> set[str]:
]
query_bundle = QueryBundle(query_str=self.prompt)

question_gen = LLMQuestionGenerator.from_defaults(
llm=llm,
)
query_engine = SubQuestionQueryEngine.from_defaults(
query_engine_tools=query_engine_tools,
question_gen=question_gen,
use_async=True,
llm=self.llm,
llm=llm,
)

response = query_engine.query(str_or_query_bundle=query_bundle)
Expand Down
Empty file.
12 changes: 12 additions & 0 deletions prompt-service/src/unstract/prompt_service/tests/unit/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Pytest configuration for unit tests.

Unit tests should not require external dependencies or the full app.
This conftest intentionally does NOT import Flask app components.

WARNING: This file is NOT auto-loaded when running via tox because
--noconftest is used to skip the parent tests/conftest.py (which
imports Flask blueprints and triggers the full adapter import chain).
If you add shared fixtures here, either remove --noconftest from
tox.ini and fix the parent conftest's eager imports, or define
fixtures directly in test files.
"""
Loading
Loading