diff --git a/.cursor b/.cursor index 67480b7..66e8c67 160000 --- a/.cursor +++ b/.cursor @@ -1 +1 @@ -Subproject commit 67480b7ec270ea5864d3d2a723e7d3cc94fd2c0a +Subproject commit 66e8c67fc09359abfeeb2f7d9cab79c486fbe738 diff --git a/Convention b/Convention index efd826f..058975c 160000 --- a/Convention +++ b/Convention @@ -1 +1 @@ -Subproject commit efd826f6777e935ca0dceabfa2fcdfbd47e1ac5a +Subproject commit 058975c37d7bc507c53247574520972fe88b1e0d diff --git a/cli.py b/cli.py index b1b3c67..0247918 100644 --- a/cli.py +++ b/cli.py @@ -6,22 +6,22 @@ from llama_index.llms.ollama import Ollama from llama_index.core.chat_engine import SimpleChatEngine from llama_index.core.chat_engine.types import StreamingAgentChatResponse from llama_index.core.llms import ChatMessage -try: - from llama_index.core.llms.types import ImageBlock, TextBlock -except ImportError: - try: - # 尝试其他可能的导入路径 - from llama_index.core import ImageBlock, TextBlock - except ImportError: - # 如果都失败,定义简单的占位类 - class ImageBlock: - def __init__(self, base64_str=None, path=None): - self.base64_str = base64_str - self.path = path - - class TextBlock: - def __init__(self, text=""): - self.text = text +# try: +# from llama_index.core.llms.types import ImageBlock, TextBlock +# except ImportError: +# try: +# # 尝试其他可能的导入路径 +# from llama_index.core import ImageBlock, TextBlock +# except ImportError: +# # 如果都失败,定义简单的占位类 +# class ImageBlock: +# def __init__(self, base64_str=None, path=None): +# self.base64_str = base64_str +# self.path = path +# +# class TextBlock: +# def __init__(self, text=""): +# self.text = text from llama_index.core import Settings from Convention.Runtime.File import ToolFile import requests @@ -358,7 +358,7 @@ def image_file_to_base64(image_path: str) -> str: PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"读取图片文件失败 {image_path}: {e}") return None -async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> None: +async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> str: """ 直接调用 Ollama API 进行带图片的流式聊天 @@ -382,11 +382,15 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base "role": msg.role if hasattr(msg, 'role') else "user", "content": msg.content if hasattr(msg, 'content') else str(msg) } - # 如果是第一条用户消息且有图片,添加图片 - if (hasattr(msg, 'role') and msg.role == "user") and image_base64 and len(api_messages) == 0: - api_msg["images"] = [image_base64] api_messages.append(api_msg) + # 如果有图片,添加到最后一条用户消息(当前用户消息) + if image_base64: + for i in range(len(api_messages) - 1, -1, -1): + if api_messages[i].get("role") == "user": + api_messages[i]["images"] = [image_base64] + break + payload = { "model": model, "messages": api_messages, @@ -441,6 +445,8 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base if len(buffer_response) > 0: if TTS_ENABLE: await play_vocal(buffer_response) + + return buffer_response except Exception as e: PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"Ollama API 调用错误: {e}") if VERBOSE: @@ -467,7 +473,7 @@ def capture_screenshot() -> str: PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"截图失败: {e}") return None -async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False) -> None: +async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False, conversation_history: Optional[list] = None) -> list: """ 使用 Ollama LLM 进行多模态聊天 @@ -501,16 +507,22 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a # 图片将通过 Ollama 的底层 API 传递 chat_message = ChatMessage(role="user", content=user_message) - # 构建消息列表 - messages = [chat_message] + # 构建消息列表 - 使用对话历史(如果提供) + if conversation_history is None: + conversation_history = [] - # 如果有系统提示,添加到消息列表开头 - if SYSTEM_PROMPT_PATH is not None: + # 构建完整的消息列表(包含历史记录) + messages = conversation_history.copy() + + # 如果对话历史为空,且需要系统提示,添加 system 消息(只添加一次) + if len(messages) == 0 and SYSTEM_PROMPT_PATH is not None: system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText() if system_prompt: - # 将系统提示添加到用户消息中,因为 Ollama 可能不支持 system role - user_message = f"{system_prompt}\n\n{user_message}" - messages[0] = ChatMessage(role="user", content=user_message) + system_msg = ChatMessage(role="system", content=system_prompt) + messages.append(system_msg) + + # 添加当前用户消息 + messages.append(chat_message) buffer_response = "" end_symbol = ['。', '?', '!'] @@ -519,33 +531,41 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a # 如果有图片,需要直接调用 Ollama API,因为 llama-index 的封装可能不支持图片 if image_base64: # 直接调用 Ollama 的流式 API - await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol) - return + assistant_response = await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol) + else: + # 使用流式聊天(无图片时) + streaming_response = await llm.astream_chat(messages) + + # 实时输出流式文本 + async for chunk in streaming_response.async_response_gen(): + await asyncio.sleep(0.01) + print(chunk, end='', flush=True) + for ch in chunk: + buffer_response += ch + if len(buffer_response) > 20: + if ch in end_symbol: + if TTS_ENABLE: + await play_vocal(buffer_response.strip()) + buffer_response = "" + + assistant_response = buffer_response.strip() + if len(assistant_response) > 0: + if TTS_ENABLE: + await play_vocal(assistant_response) - # 使用流式聊天(无图片时) - streaming_response = await llm.astream_chat(messages) + # 更新对话历史:添加用户消息和助手响应 + updated_history = messages.copy() + if assistant_response: + assistant_msg = ChatMessage(role="assistant", content=assistant_response) + updated_history.append(assistant_msg) - # 实时输出流式文本 - async for chunk in streaming_response.async_response_gen(): - await asyncio.sleep(0.01) - print(chunk, end='', flush=True) - for ch in chunk: - buffer_response += ch - if len(buffer_response) > 20: - if ch in end_symbol: - if TTS_ENABLE: - await play_vocal(buffer_response.strip()) - buffer_response = "" - - buffer_response = buffer_response.strip() - if len(buffer_response) > 0: - if TTS_ENABLE: - await play_vocal(buffer_response) + return updated_history except Exception as e: PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"聊天错误: {e}") if VERBOSE: import traceback traceback.print_exc() + return conversation_history if conversation_history else [] def add_speaker() -> None: @@ -588,6 +608,9 @@ async def event_loop(llm: Ollama) -> None: PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "支持的图片格式: .png, .jpg, .jpeg") PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "输入 'quit' 或 'exit' 退出\n") + # 维护对话历史,避免重复发送系统提示词 + conversation_history = [] + message = input("请开始对话: ") wait_second = AUTO_SPEAK_WAIT_SECOND try: @@ -614,19 +637,19 @@ async def event_loop(llm: Ollama) -> None: PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功") PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='') - await achat(llm, message, image_base64) + conversation_history = await achat(llm, message, image_base64, False, conversation_history) PrintColorful(ConsoleFrontColor.RESET,"") # 等待用户输入 message = await ainput(wait_second) if not message: # 用户没有输入,触发 AI 自主发言(会自动截图) - wait_second = max(wait_second*1.5, 3600) + wait_second = min(wait_second*1.5, 3600) if VERBOSE: PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"用户无输入,等待 {wait_second} 秒后 AI 自主发言...") # 触发 AI 自主发言(会自动截图) PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='') - await achat(llm, "", None, auto_screenshot=True) + conversation_history = await achat(llm, "", None, auto_screenshot=True, conversation_history=conversation_history) PrintColorful(ConsoleFrontColor.RESET,"") else: wait_second = AUTO_SPEAK_WAIT_SECOND diff --git a/requirements.txt b/requirements.txt index 046040a..cb2ae4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,12 +3,14 @@ aiofiles==23.2.1 aiohappyeyeballs==2.6.1 aiohttp==3.13.2 aiosignal==1.4.0 +aiosqlite==0.22.0 annotated-types==0.7.0 antlr4-python3-runtime==4.9.3 anyio==4.11.0 async-timeout==5.0.1 attrs==25.4.0 audioread==3.1.0 +banks==2.2.0 beautifulsoup4==4.14.2 cachetools==6.2.2 certifi==2025.11.12 @@ -21,14 +23,20 @@ conformer==0.3.2 contourpy==1.3.2 cycler==0.12.1 Cython==3.2.1 +dataclasses-json==0.6.7 decorator==5.2.1 +defusedxml==0.7.1 +Deprecated==1.2.18 diffusers==0.29.0 +dirtyjson==1.0.8 +distro==1.9.0 einops==0.8.1 exceptiongroup==1.3.0 fastapi==0.115.6 fastapi-cli==0.0.4 ffmpy==1.0.0 filelock==3.20.0 +filetype==1.2.0 flatbuffers==25.9.23 fonttools==4.60.1 frozenlist==1.8.0 @@ -38,6 +46,8 @@ google-auth==2.43.0 google-auth-oauthlib==1.0.0 gradio==5.4.0 gradio_client==1.4.2 +greenlet==3.3.0 +griffe==1.15.0 grpcio==1.57.0 grpcio-tools==1.57.0 h11==0.16.0 @@ -52,6 +62,7 @@ importlib_metadata==8.7.0 inflect==7.3.1 intel-openmp==2021.4.0 Jinja2==3.1.6 +jiter==0.12.0 joblib==1.5.2 kaldifst==1.7.17 kiwisolver==1.4.9 @@ -59,10 +70,13 @@ lazy_loader==0.4 librosa==0.10.2 lightning==2.2.4 lightning-utilities==0.15.2 +llama-index +llama-index-llms-ollama llvmlite==0.45.1 Markdown==3.10 markdown-it-py==4.0.0 MarkupSafe==2.1.5 +marshmallow==3.26.1 matplotlib==3.7.5 mdurl==0.1.2 mkl==2021.4.0 @@ -71,17 +85,22 @@ more-itertools==10.8.0 mpmath==1.3.0 msgpack==1.1.2 multidict==6.7.0 +mypy_extensions==1.1.0 +nest-asyncio==1.6.0 networkx==3.1 +nltk==3.9.2 numba==0.62.1 numpy==1.26.4 oauthlib==3.3.1 +ollama==0.6.1 omegaconf==2.3.0 onnx==1.16.0 onnxruntime==1.23.2 +openai==2.14.0 openai-whisper==20231117 orjson==3.11.4 packaging==24.2 -pandas==2.3.3 +pandas==2.2.3 pillow==11.3.0 platformdirs==4.5.0 pooch==1.8.2 @@ -90,15 +109,18 @@ protobuf==4.25.0 pyarrow==18.1.0 pyasn1==0.6.1 pyasn1_modules==0.4.2 +PyAudio==0.2.14 pycparser==2.23 -pydantic==2.7.0 -pydantic_core==2.18.1 +pydantic==2.12.5 +pydantic_core==2.41.5 pydub==0.25.1 Pygments==2.19.2 pyparsing==3.2.5 +pypdf==6.4.2 pyreadline3==3.5.4 PySocks==1.7.1 python-dateutil==2.9.0.post0 +python-dotenv==1.2.1 python-multipart==0.0.12 pytorch-lightning==2.5.6 pytz==2025.2 @@ -123,9 +145,12 @@ sniffio==1.3.1 soundfile==0.12.1 soupsieve==2.8 soxr==1.0.0 +SQLAlchemy==2.0.45 starlette==0.41.3 +striprtf==0.0.26 sympy==1.14.0 tbb==2021.13.1 +tenacity==9.1.2 tensorboard==2.14.0 tensorboard-data-server==0.7.2 threadpoolctl==3.6.0 @@ -139,6 +164,8 @@ tqdm==4.67.1 transformers==4.51.3 typeguard==4.4.4 typer==0.20.0 +typing-inspect==0.9.0 +typing-inspection==0.4.2 typing_extensions==4.15.0 tzdata==2025.2 urllib3==2.5.0 @@ -147,5 +174,6 @@ websockets==12.0 Werkzeug==3.1.3 wetext==0.0.4 wget==3.2 +wrapt==1.17.3 yarl==1.22.0 zipp==3.23.0