v0.12.22 (#17969)

logan-markewich · web-flow · commit 581561333594 · 2025-02-28T20:32:58.000-06:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,19 @@
 # ChangeLog
 
+## [2025-02-28]
+
+### `llama-index-core` [0.12.22]
+
+- fix AgentWorkflow tool call tracking on final response (#17968)
+
+### `llama-index-readers-github` [0.6.0]
+
+- Ensure that GitHub reader uses timeout and retries params (#17959)
+
+### `llama-index-readers-web` [0.3.7]
+
+- chore: update FireCrawlWebReader integration to support extract (#17957)
+
 ## [2025-02-27]
 
 ### `llama-index-core` [0.12.21]
diff --git a/docs/docs/CHANGELOG.md b/docs/docs/CHANGELOG.md
@@ -1,5 +1,19 @@
 # ChangeLog
 
+## [2025-02-28]
+
+### `llama-index-core` [0.12.22]
+
+- fix AgentWorkflow tool call tracking on final response (#17968)
+
+### `llama-index-readers-github` [0.6.0]
+
+- Ensure that GitHub reader uses timeout and retries params (#17959)
+
+### `llama-index-readers-web` [0.3.7]
+
+- chore: update FireCrawlWebReader integration to support extract (#17957)
+
 ## [2025-02-27]
 
 ### `llama-index-core` [0.12.21]
@@ -10,7 +24,7 @@
 - Feature/remove retriever tool template override (#17909)
 - only modify delta if 'Answer:' was actually detected (#17901)
 - Fix CitationQueryEngine init function for response_synthesizer (#17897)
-- fix ChatSummaryMemoryBuffer._summarize_oldest_chat_history (#17845)
+- fix `ChatSummaryMemoryBuffer._summarize_oldest_chat_history` (#17845)
 - fix: make base64 detection more robust across the board (#17930)
 - fix: stepwise execution breaks when steps do async work (#17914)
 - safer workflow cancel + fix restored context bug (#17938)
diff --git a/llama-index-core/llama_index/core/__init__.py b/llama-index-core/llama_index/core/__init__.py
@@ -1,6 +1,6 @@
 """Init file of LlamaIndex."""
 
-__version__ = "0.12.21"
+__version__ = "0.12.22"
 
 import logging
 from logging import NullHandler
diff --git a/llama-index-core/pyproject.toml b/llama-index-core/pyproject.toml
@@ -46,7 +46,7 @@ name = "llama-index-core"
 packages = [{include = "llama_index"}]
 readme = "README.md"
 repository = "https://github.com/run-llama/llama_index"
-version = "0.12.21"
+version = "0.12.22"
 
 [tool.poetry.dependencies]
 SQLAlchemy = {extras = ["asyncio"], version = ">=1.4.49"}
diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/llama_index/llms/huggingface_api/base.py
@@ -173,10 +173,13 @@ def __init__(self, **kwargs: Any) -> None:
         self._sync_client = InferenceClient(**self._get_inference_client_kwargs())
         self._async_client = AsyncInferenceClient(**self._get_inference_client_kwargs())
 
-        # set context window if not provided
-        info = self._sync_client.get_endpoint_info()
-        if "max_input_tokens" in info and kwargs.get("context_window") is None:
-            self.context_window = info["max_input_tokens"]
+        # set context window if not provided, if we can get the endpoint info
+        try:
+            info = self._sync_client.get_endpoint_info()
+            if "max_input_tokens" in info and kwargs.get("context_window") is None:
+                self.context_window = info["max_input_tokens"]
+        except Exception:
+            pass
 
     def _get_inference_client_kwargs(self) -> Dict[str, Any]:
         """Extract the Hugging Face InferenceClient construction parameters."""
@@ -224,7 +227,7 @@ def _to_huggingface_messages(
 
     def _parse_streaming_tool_calls(
         self, tool_call_strs: List[str]
-    ) -> List[ToolSelection | str]:
+    ) -> List[Union[ToolSelection, str]]:
         tool_calls = []
         # Try to parse into complete objects, otherwise keep as strings
         for tool_call_str in tool_call_strs:
diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-huggingface-api/pyproject.toml
@@ -27,7 +27,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-huggingface-api"
 readme = "README.md"
-version = "0.4.0"
+version = "0.4.1"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
diff --git a/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py b/llama-index-integrations/llms/llama-index-llms-huggingface-api/tests/test_huggingface_api.py
@@ -3,8 +3,9 @@
 import pytest
 from llama_index.core.llms import ChatMessage, MessageRole
 from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
+from huggingface_hub.inference._generated.types import ChatCompletionOutput
 
-STUB_MODEL_NAME = "placeholder_model"
+STUB_MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
 
 
 @pytest.fixture(name="hf_inference_api")
@@ -45,15 +46,17 @@ def test_chat(self, hf_inference_api: HuggingFaceInferenceAPI) -> None:
         generated_response = (
             " It's based on the book of the same name by James Fenimore Cooper."
         )
-        conversational_return = {
-            "choices": [
-                {
-                    "message": {
-                        "content": generated_response,
+        conversational_return = ChatCompletionOutput.parse_obj(
+            {
+                "choices": [
+                    {
+                        "message": {
+                            "content": generated_response,
+                        }
                     }
-                }
-            ],
-        }
+                ],
+            }
+        )
 
         with patch.object(
             hf_inference_api._sync_client,
@@ -67,6 +70,8 @@ def test_chat(self, hf_inference_api: HuggingFaceInferenceAPI) -> None:
         mock_conversational.assert_called_once_with(
             messages=[{"role": m.role.value, "content": m.content} for m in messages],
             model=STUB_MODEL_NAME,
+            temperature=0.1,
+            max_tokens=256,
         )
 
     def test_chat_text_generation(
@@ -97,6 +102,8 @@ def test_chat_text_generation(
         assert response.message.content == conversational_return
         mock_complete.assert_called_once_with(
             "System: You are an expert movie reviewer\nUser: Which movie is the best?\nAssistant:",
+            model=STUB_MODEL_NAME,
+            temperature=0.1,
             max_new_tokens=256,
         )
 
@@ -109,5 +116,7 @@ def test_complete(self, hf_inference_api: HuggingFaceInferenceAPI) -> None:
             return_value=generated_text,
         ) as mock_text_generation:
             response = hf_inference_api.complete(prompt)
-        mock_text_generation.assert_called_once_with(prompt, max_new_tokens=256)
+        mock_text_generation.assert_called_once_with(
+            prompt, model=STUB_MODEL_NAME, temperature=0.1, max_new_tokens=256
+        )
         assert response.text == generated_text
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -45,7 +45,7 @@ name = "llama-index"
 packages = [{from = "_llama-index", include = "llama_index"}]
 readme = "README.md"
 repository = "https://github.com/run-llama/llama_index"
-version = "0.12.21"
+version = "0.12.22"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
@@ -57,7 +57,7 @@ llama-index-agent-openai = "^0.4.0"
 llama-index-readers-file = "^0.4.0"
 llama-index-readers-llama-parse = ">=0.4.0"
 llama-index-indices-managed-llama-cloud = ">=0.4.0"
-llama-index-core = "^0.12.21"
+llama-index-core = "^0.12.22"
 llama-index-multi-modal-llms-openai = "^0.4.0"
 llama-index-cli = "^0.4.1"
 nltk = ">3.8.1"  # avoids a CVE, temp until next release, should be in llama-index-core