已经可以在单一环境中运行 TTS 与 CLI

2025-12-21 04:39:58 +08:00
parent 09140e4000
commit f8c2a36a2f
4 changed files with 107 additions and 56 deletions
--- a/.cursor
+++ b/.cursor
--- a/2
+++ b/2
--- a/cli.py
+++ b/cli.py
@@ -6,22 +6,22 @@ from llama_index.llms.ollama import Ollama
 from llama_index.core.chat_engine import SimpleChatEngine
 from llama_index.core.chat_engine.types import StreamingAgentChatResponse
 from llama_index.core.llms import ChatMessage
-try:
-    from llama_index.core.llms.types import ImageBlock, TextBlock
-except ImportError:
-    try:
-        # 尝试其他可能的导入路径
-        from llama_index.core import ImageBlock, TextBlock
-    except ImportError:
-        # 如果都失败，定义简单的占位类
-        class ImageBlock:
-            def __init__(self, base64_str=None, path=None):
-                self.base64_str = base64_str
-                self.path = path
-        
-        class TextBlock:
-            def __init__(self, text=""):
-                self.text = text
+# try:
+#     from llama_index.core.llms.types import ImageBlock, TextBlock
+# except ImportError:
+#     try:
+#         # 尝试其他可能的导入路径
+#         from llama_index.core import ImageBlock, TextBlock
+#     except ImportError:
+#         # 如果都失败，定义简单的占位类
+#         class ImageBlock:
+#             def __init__(self, base64_str=None, path=None):
+#                 self.base64_str = base64_str
+#                 self.path = path
+#        
+#         class TextBlock:
+#             def __init__(self, text=""):
+#                 self.text = text
 from llama_index.core import Settings
 from Convention.Runtime.File import ToolFile
 import requests
@@ -358,7 +358,7 @@ def image_file_to_base64(image_path: str) -> str:
            PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"读取图片文件失败 {image_path}: {e}")
        return None

-async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> None:
+async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> str:
    """
    直接调用 Ollama API 进行带图片的流式聊天
    
@@ -382,11 +382,15 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base
            "role": msg.role if hasattr(msg, 'role') else "user",
            "content": msg.content if hasattr(msg, 'content') else str(msg)
        }
-        # 如果是第一条用户消息且有图片，添加图片
-        if (hasattr(msg, 'role') and msg.role == "user") and image_base64 and len(api_messages) == 0:
-            api_msg["images"] = [image_base64]
        api_messages.append(api_msg)
    
+    # 如果有图片，添加到最后一条用户消息（当前用户消息）
+    if image_base64:
+        for i in range(len(api_messages) - 1, -1, -1):
+            if api_messages[i].get("role") == "user":
+                api_messages[i]["images"] = [image_base64]
+                break
+    
    payload = {
        "model": model,
        "messages": api_messages,
@@ -441,6 +445,8 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base
        if len(buffer_response) > 0:
            if TTS_ENABLE:
                await play_vocal(buffer_response)
+        
+        return buffer_response
    except Exception as e:
        PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"Ollama API 调用错误: {e}")
        if VERBOSE:
@@ -467,7 +473,7 @@ def capture_screenshot() -> str:
            PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"截图失败: {e}")
        return None

-async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False) -> None:
+async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False, conversation_history: Optional[list] = None) -> list:
    """
    使用 Ollama LLM 进行多模态聊天
    
@@ -501,16 +507,22 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a
    # 图片将通过 Ollama 的底层 API 传递
    chat_message = ChatMessage(role="user", content=user_message)
    
-    # 构建消息列表
-    messages = [chat_message]
+    # 构建消息列表 - 使用对话历史（如果提供）
+    if conversation_history is None:
+        conversation_history = []
    
-    # 如果有系统提示，添加到消息列表开头
-    if SYSTEM_PROMPT_PATH is not None:
+    # 构建完整的消息列表（包含历史记录）
+    messages = conversation_history.copy()
+    
+    # 如果对话历史为空，且需要系统提示，添加 system 消息（只添加一次）
+    if len(messages) == 0 and SYSTEM_PROMPT_PATH is not None:
        system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText()
        if system_prompt:
-            # 将系统提示添加到用户消息中，因为 Ollama 可能不支持 system role
-            user_message = f"{system_prompt}\n\n{user_message}"
-            messages[0] = ChatMessage(role="user", content=user_message)
+            system_msg = ChatMessage(role="system", content=system_prompt)
+            messages.append(system_msg)
+    
+    # 添加当前用户消息
+    messages.append(chat_message)
    
    buffer_response = ""
    end_symbol = ['。', '？', '！']
@@ -519,33 +531,41 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a
        # 如果有图片，需要直接调用 Ollama API，因为 llama-index 的封装可能不支持图片
        if image_base64:
            # 直接调用 Ollama 的流式 API
-            await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol)
-            return
+            assistant_response = await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol)
+        else:
+            # 使用流式聊天（无图片时）
+            streaming_response = await llm.astream_chat(messages)
            
-        # 使用流式聊天（无图片时）
-        streaming_response = await llm.astream_chat(messages)
+            # 实时输出流式文本
+            async for chunk in streaming_response.async_response_gen():
+                await asyncio.sleep(0.01)
+                print(chunk, end='', flush=True)
+                for ch in chunk:
+                    buffer_response += ch
+                    if len(buffer_response) > 20:
+                        if ch in end_symbol:
+                            if TTS_ENABLE:
+                                await play_vocal(buffer_response.strip())
+                            buffer_response = ""
            
-        # 实时输出流式文本
-        async for chunk in streaming_response.async_response_gen():
-            await asyncio.sleep(0.01)
-            print(chunk, end='', flush=True)
-            for ch in chunk:
-                buffer_response += ch
-                if len(buffer_response) > 20:
-                    if ch in end_symbol:
-                        if TTS_ENABLE:
-                            await play_vocal(buffer_response.strip())
-                        buffer_response = ""
+            assistant_response = buffer_response.strip()
+            if len(assistant_response) > 0:
+                if TTS_ENABLE:
+                    await play_vocal(assistant_response)
        
-        buffer_response = buffer_response.strip()
-        if len(buffer_response) > 0:
-            if TTS_ENABLE:
-                await play_vocal(buffer_response)
+        # 更新对话历史：添加用户消息和助手响应
+        updated_history = messages.copy()
+        if assistant_response:
+            assistant_msg = ChatMessage(role="assistant", content=assistant_response)
+            updated_history.append(assistant_msg)
+        
+        return updated_history
    except Exception as e:
        PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"聊天错误: {e}")
        if VERBOSE:
            import traceback
            traceback.print_exc()
+        return conversation_history if conversation_history else []


 def add_speaker() -> None:
@@ -588,6 +608,9 @@ async def event_loop(llm: Ollama) -> None:
    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "支持的图片格式: .png, .jpg, .jpeg")
    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "输入 'quit' 或 'exit' 退出\n")
    
+    # 维护对话历史，避免重复发送系统提示词
+    conversation_history = []
+    
    message = input("请开始对话: ")
    wait_second = AUTO_SPEAK_WAIT_SECOND
    try:
@@ -614,19 +637,19 @@ async def event_loop(llm: Ollama) -> None:
                    PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功")
            
            PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
-            await achat(llm, message, image_base64)
+            conversation_history = await achat(llm, message, image_base64, False, conversation_history)
            PrintColorful(ConsoleFrontColor.RESET,"")
            
            # 等待用户输入
            message = await ainput(wait_second)
            if not message:
                # 用户没有输入，触发 AI 自主发言（会自动截图）
-                wait_second = max(wait_second*1.5, 3600)
+                wait_second = min(wait_second*1.5, 3600)
                if VERBOSE:
                    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"用户无输入，等待 {wait_second} 秒后 AI 自主发言...")
                # 触发 AI 自主发言（会自动截图）
                PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
-                await achat(llm, "", None, auto_screenshot=True)
+                conversation_history = await achat(llm, "", None, auto_screenshot=True, conversation_history=conversation_history)
                PrintColorful(ConsoleFrontColor.RESET,"")
            else:
                wait_second = AUTO_SPEAK_WAIT_SECOND
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,12 +3,14 @@ aiofiles==23.2.1
 aiohappyeyeballs==2.6.1
 aiohttp==3.13.2
 aiosignal==1.4.0
+aiosqlite==0.22.0
 annotated-types==0.7.0
 antlr4-python3-runtime==4.9.3
 anyio==4.11.0
 async-timeout==5.0.1
 attrs==25.4.0
 audioread==3.1.0
+banks==2.2.0
 beautifulsoup4==4.14.2
 cachetools==6.2.2
 certifi==2025.11.12
@@ -21,14 +23,20 @@ conformer==0.3.2
 contourpy==1.3.2
 cycler==0.12.1
 Cython==3.2.1
+dataclasses-json==0.6.7
 decorator==5.2.1
+defusedxml==0.7.1
+Deprecated==1.2.18
 diffusers==0.29.0
+dirtyjson==1.0.8
+distro==1.9.0
 einops==0.8.1
 exceptiongroup==1.3.0
 fastapi==0.115.6
 fastapi-cli==0.0.4
 ffmpy==1.0.0
 filelock==3.20.0
+filetype==1.2.0
 flatbuffers==25.9.23
 fonttools==4.60.1
 frozenlist==1.8.0
@@ -38,6 +46,8 @@ google-auth==2.43.0
 google-auth-oauthlib==1.0.0
 gradio==5.4.0
 gradio_client==1.4.2
+greenlet==3.3.0
+griffe==1.15.0
 grpcio==1.57.0
 grpcio-tools==1.57.0
 h11==0.16.0
@@ -52,6 +62,7 @@ importlib_metadata==8.7.0
 inflect==7.3.1
 intel-openmp==2021.4.0
 Jinja2==3.1.6
+jiter==0.12.0
 joblib==1.5.2
 kaldifst==1.7.17
 kiwisolver==1.4.9
@@ -59,10 +70,13 @@ lazy_loader==0.4
 librosa==0.10.2
 lightning==2.2.4
 lightning-utilities==0.15.2
+llama-index
+llama-index-llms-ollama
 llvmlite==0.45.1
 Markdown==3.10
 markdown-it-py==4.0.0
 MarkupSafe==2.1.5
+marshmallow==3.26.1
 matplotlib==3.7.5
 mdurl==0.1.2
 mkl==2021.4.0
@@ -71,17 +85,22 @@ more-itertools==10.8.0
 mpmath==1.3.0
 msgpack==1.1.2
 multidict==6.7.0
+mypy_extensions==1.1.0
+nest-asyncio==1.6.0
 networkx==3.1
+nltk==3.9.2
 numba==0.62.1
 numpy==1.26.4
 oauthlib==3.3.1
+ollama==0.6.1
 omegaconf==2.3.0
 onnx==1.16.0
 onnxruntime==1.23.2
+openai==2.14.0
 openai-whisper==20231117
 orjson==3.11.4
 packaging==24.2
-pandas==2.3.3
+pandas==2.2.3
 pillow==11.3.0
 platformdirs==4.5.0
 pooch==1.8.2
@@ -90,15 +109,18 @@ protobuf==4.25.0
 pyarrow==18.1.0
 pyasn1==0.6.1
 pyasn1_modules==0.4.2
+PyAudio==0.2.14
 pycparser==2.23
-pydantic==2.7.0
-pydantic_core==2.18.1
+pydantic==2.12.5
+pydantic_core==2.41.5
 pydub==0.25.1
 Pygments==2.19.2
 pyparsing==3.2.5
+pypdf==6.4.2
 pyreadline3==3.5.4
 PySocks==1.7.1
 python-dateutil==2.9.0.post0
+python-dotenv==1.2.1
 python-multipart==0.0.12
 pytorch-lightning==2.5.6
 pytz==2025.2
@@ -123,9 +145,12 @@ sniffio==1.3.1
 soundfile==0.12.1
 soupsieve==2.8
 soxr==1.0.0
+SQLAlchemy==2.0.45
 starlette==0.41.3
+striprtf==0.0.26
 sympy==1.14.0
 tbb==2021.13.1
+tenacity==9.1.2
 tensorboard==2.14.0
 tensorboard-data-server==0.7.2
 threadpoolctl==3.6.0
@@ -139,6 +164,8 @@ tqdm==4.67.1
 transformers==4.51.3
 typeguard==4.4.4
 typer==0.20.0
+typing-inspect==0.9.0
+typing-inspection==0.4.2
 typing_extensions==4.15.0
 tzdata==2025.2
 urllib3==2.5.0
@@ -147,5 +174,6 @@ websockets==12.0
 Werkzeug==3.1.3
 wetext==0.0.4
 wget==3.2
+wrapt==1.17.3
 yarl==1.22.0
 zipp==3.23.0