diff --git a/Convention b/Convention
index ad17b90..efd826f 160000
--- a/Convention
+++ b/Convention
@@ -1 +1 @@
-Subproject commit ad17b905c4889facecb3fd29e3a8f73d4aae2813
+Subproject commit efd826f6777e935ca0dceabfa2fcdfbd47e1ac5a
diff --git a/README.md b/README.md
index ed42d46..699c9af 100644
--- a/README.md
+++ b/README.md
@@ -47,7 +47,7 @@ pip install -r requirements.txt
     "tts_speaker_id": "speaker",
     "tts_timeout": 180,
     "sentence_max_length": 100,
-    "tts_enabled": true,//可选
+    "tts_enable": true,// optional
     "system_prompt_path": "Assets/prompt.txt",
     "stream_enable": false,//web_server.py不要开启
     "auto_speak_wait_second": 45.0,
diff --git a/cli.py b/cli.py
index f376762..b1b3c67 100644
--- a/cli.py
+++ b/cli.py
@@ -5,6 +5,23 @@ from datetime import datetime
 from llama_index.llms.ollama import Ollama
 from llama_index.core.chat_engine import SimpleChatEngine
 from llama_index.core.chat_engine.types import StreamingAgentChatResponse
+from llama_index.core.llms import ChatMessage
+try:
+    from llama_index.core.llms.types import ImageBlock, TextBlock
+except ImportError:
+    try:
+        # Try an alternative import path
+        from llama_index.core import ImageBlock, TextBlock
+    except ImportError:
+        # If both imports fail, define simple placeholder classes
+        class ImageBlock:
+            def __init__(self, base64_str=None, path=None):
+                self.base64_str = base64_str
+                self.path = path
+
+        class TextBlock:
+            def __init__(self, text=""):
+                self.text = text
 from llama_index.core import Settings
 from Convention.Runtime.File import ToolFile
 import requests
@@ -13,6 +30,12 @@ import wave
 import io
 import numpy as np
 import pyaudio
+import base64
+import os
+import re
+from typing import Optional
+from PIL import ImageGrab
+from io import BytesIO
 
 chat_start_id = datetime.now().strftime("%Y%m%d_%H%M%S")
 
@@ -276,28 +299,253 @@ async def ainput(wait_seconds:float) -> str:
         return ""
 
 
-async def achat(engine:SimpleChatEngine,message:str) -> None:
-    user_message = message if message not in [None, ""] else "(没有人说话, 请延续发言或是寻找新的话题)"
-    streaming_response: StreamingAgentChatResponse = await engine.astream_chat(user_message)
+def extract_image_paths(text: str) -> list[str]:
+    """
+    Extract every image path referenced with a leading @ from the text.
+
+    Args:
+        text: raw user input
+
+    Returns:
+        List of valid image paths
+    """
+    # Match paths that start with @, optionally wrapped in quotes
+    pattern = r'@["\']?([^"\'\s]+\.(?:png|jpg|jpeg))["\']?'
+    matches = re.findall(pattern, text, re.IGNORECASE)
+    valid_paths = []
+    for match in matches:
+        if os.path.isfile(match) and match.lower().endswith(('.png', '.jpg', '.jpeg')):
+            valid_paths.append(match)
+        elif VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"图片路径不存在或格式不支持: {match}")
+    return valid_paths
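+# Illustrative example (hypothetical paths): for the input
+#   '看看 @./shots/cat.png 和 @"photo.jpg"'
+# the pattern captures './shots/cat.png' and 'photo.jpg'; only paths that
+# actually exist on disk survive the isfile() filter. Because the character
+# class excludes whitespace, quoted paths containing spaces are not matched.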
+
+def remove_image_paths_from_text(text: str, image_paths: list[str]) -> str:
+    """
+    Strip image path references from the text.
+
+    Args:
+        text: original text
+        image_paths: list of image paths to remove
+
+    Returns:
+        The cleaned text
+    """
+    result = text
+    for path in image_paths:
+        # Remove references of the form @path
+        result = re.sub(rf'@["\']?{re.escape(path)}["\']?', '', result, flags=re.IGNORECASE)
+    # Collapse redundant whitespace
+    result = re.sub(r'\s+', ' ', result).strip()
+    return result
+
+def image_file_to_base64(image_path: str) -> Optional[str]:
+    """
+    Read an image file and encode it as base64.
+
+    Args:
+        image_path: path to the image file
+
+    Returns:
+        The base64-encoded string, or None on failure
+    """
+    try:
+        with open(image_path, 'rb') as image_file:
+            image_data = image_file.read()
+        return base64.b64encode(image_data).decode('utf-8')
+    except Exception as e:
+        if VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"读取图片文件失败 {image_path}: {e}")
+        return None
+
+async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> None:
+    """
+    Call the Ollama chat API directly for streaming chat with an image.
+
+    Args:
+        llm: the Ollama LLM instance
+        messages: list of chat messages
+        image_base64: base64-encoded image
+        end_symbol: sentence-ending symbols used to flush the TTS buffer
+    """
     buffer_response = ""
+    import aiohttp  # imported lazily; only needed on the image-chat path
+    import json
+
+    # Build the request target
+    url = f"{llm.base_url}/api/chat" if hasattr(llm, 'base_url') else f"{OLLAMA_URL}/api/chat"
+    model = llm.model if hasattr(llm, 'model') else OLLAMA_MODEL
+
+    # Convert the messages into the Ollama API format
+    api_messages = []
+    for msg in messages:
+        api_msg = {
+            "role": msg.role if hasattr(msg, 'role') else "user",
+            "content": msg.content if hasattr(msg, 'content') else str(msg)
+        }
+        # Attach the image to the first user message
+        if (hasattr(msg, 'role') and msg.role == "user") and image_base64 and len(api_messages) == 0:
+            api_msg["images"] = [image_base64]
+        api_messages.append(api_msg)
+
+    payload = {
+        "model": model,
+        "messages": api_messages,
+        "stream": True
+    }
+
+    if hasattr(llm, 'temperature') and llm.temperature:
+        payload["options"] = {"temperature": llm.temperature}
+
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, json=payload) as response:
+                if response.status != 200:
+                    error_text = await response.text()
+                    raise Exception(f"Ollama API 错误: {response.status} - {error_text}")
+
+                # Ollama streams its response as newline-delimited JSON
+                async for line_bytes in response.content:
+                    if not line_bytes:
+                        continue
+
+                    try:
+                        line_text = line_bytes.decode('utf-8').strip()
+                        if not line_text:
+                            continue
+
+                        data = json.loads(line_text)
+                        if 'message' in data and 'content' in data['message']:
+                            chunk = data['message']['content']
+                            if chunk:
+                                await asyncio.sleep(0.01)
+                                print(chunk, end='', flush=True)
+                                for ch in chunk:
+                                    buffer_response += ch
+                                    if len(buffer_response) > 20:
+                                        if ch in end_symbol:
+                                            if TTS_ENABLE:
+                                                await play_vocal(buffer_response.strip())
+                                            buffer_response = ""
+
+                        # Stop once the stream reports completion
+                        if data.get('done', False):
+                            break
+                    except json.JSONDecodeError:
+                        continue
+                    except Exception as e:
+                        if VERBOSE:
+                            PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"解析响应错误: {e}")
+
+        buffer_response = buffer_response.strip()
+        if len(buffer_response) > 0:
+            if TTS_ENABLE:
+                await play_vocal(buffer_response)
+    except Exception as e:
+        PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"Ollama API 调用错误: {e}")
+        if VERBOSE:
+            import traceback
+            traceback.print_exc()
+        raise
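+# For reference: each streamed line is a JSON object of roughly the form
+#   {"message": {"role": "assistant", "content": "..."}, "done": false}
+# terminated by a final record with "done": true, which is what the loop
+# above uses as its stop condition.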
+
+def capture_screenshot() -> Optional[str]:
+    """
+    Capture the current screen and encode it as base64.
+
+    Returns:
+        The base64-encoded screenshot, or None on failure
+    """
+    try:
+        # Grab the screen with PIL's ImageGrab
+        screenshot = ImageGrab.grab()
+        buffered = BytesIO()
+        screenshot.save(buffered, format="PNG")
+        image_data = buffered.getvalue()
+        return base64.b64encode(image_data).decode('utf-8')
+    except Exception as e:
+        if VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"截图失败: {e}")
+        return None
+
+async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False) -> None:
+    """
+    Run one multimodal chat turn against the Ollama LLM.
+
+    Args:
+        llm: the Ollama LLM instance
+        message: text message
+        image_base64: optional base64-encoded image (a single image)
+        auto_screenshot: whether to take a screenshot automatically when no
+            image is provided (used for AI-initiated turns)
+    """
+    user_message = message if message not in [None, ""] else "(没有人说话, 请延续发言或是寻找新的话题)"
+    is_auto_speak = user_message == "(没有人说话, 请延续发言或是寻找新的话题)"
+
+    # Image handling: use the provided image if there is one; otherwise take
+    # a screenshot when requested
+    if not image_base64 and (auto_screenshot or is_auto_speak):
+        # The AI is speaking on its own initiative, so capture the screen
+        if VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "AI 自主发言,自动截图...")
+        image_base64 = capture_screenshot()
+        if image_base64 and VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功")
+
+    # If there is an image, make sure the base64 string carries no data URL prefix
+    if image_base64:
+        if image_base64.startswith('data:image'):
+            # Strip the data:image/xxx;base64, prefix
+            image_base64 = image_base64.split(',', 1)[1]
+        if VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "已添加图片到消息")
+
+    # Build the message as a ChatMessage carrying only the text content;
+    # the image is passed through Ollama's underlying API instead
+    chat_message = ChatMessage(role="user", content=user_message)
+
+    # Assemble the message list
+    messages = [chat_message]
+
+    # If a system prompt is configured, prepend it
+    if SYSTEM_PROMPT_PATH is not None:
+        system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText()
+        if system_prompt:
+            # Fold the system prompt into the user message, since Ollama may
+            # not support the system role here
+            user_message = f"{system_prompt}\n\n{user_message}"
+            messages[0] = ChatMessage(role="user", content=user_message)
+
+    buffer_response = ""
     end_symbol = ['。', '?', '!']
-
-    # 实时输出流式文本
-    async for chunk in streaming_response.async_response_gen():
-        await asyncio.sleep(0.01)
-        print(chunk, end='', flush=True)
-        for ch in chunk:
-            buffer_response += ch
-            if len(buffer_response) > 20:
-                if ch in end_symbol:
-                    if TTS_ENABLE:
-                        await play_vocal(buffer_response.strip())
-                    buffer_response = ""
-    buffer_response = buffer_response.strip()
-    if len(buffer_response) > 0:
-        if TTS_ENABLE:
-            await play_vocal(buffer_response)
+
+    try:
+        # With an image we call the Ollama API directly, because the
+        # llama-index wrapper may not support image input
+        if image_base64:
+            # Direct call to Ollama's streaming API
+            await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol)
+            return
+
+        # Streaming chat without an image
+        streaming_response = await llm.astream_chat(messages)
+
+        # Print the streamed text in real time
+        async for chunk in streaming_response.async_response_gen():
+            await asyncio.sleep(0.01)
+            print(chunk, end='', flush=True)
+            for ch in chunk:
+                buffer_response += ch
+                if len(buffer_response) > 20:
+                    if ch in end_symbol:
+                        if TTS_ENABLE:
+                            await play_vocal(buffer_response.strip())
+                        buffer_response = ""
+
+        buffer_response = buffer_response.strip()
+        if len(buffer_response) > 0:
+            if TTS_ENABLE:
+                await play_vocal(buffer_response)
+    except Exception as e:
+        PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"聊天错误: {e}")
+        if VERBOSE:
+            import traceback
+            traceback.print_exc()
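+# Illustrative calls (assuming an initialized Ollama instance `llm` and a
+# hypothetical image path):
+#   await achat(llm, "描述一下这张图", image_file_to_base64("./shots/cat.png"))
+#   await achat(llm, "", auto_screenshot=True)  # AI-initiated turn, auto-screenshot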
 
 
 def add_speaker() -> None:
@@ -322,22 +570,64 @@
         raise Exception(f"添加音色失败: {response.status_code} - {response.text}")
 
 
-async def event_loop(engine:SimpleChatEngine) -> None:
+async def event_loop(llm: Ollama) -> None:
+    """
+    Event loop that handles user input and AI responses.
+
+    Args:
+        llm: the Ollama LLM instance
+    """
     if TTS_ENABLE:
         add_speaker()
         audio_player_task = asyncio.create_task(audio_player_worker())
     else:
         audio_player_task = None
+
+    PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "多模态聊天已启动(支持文本和图片输入)")
+    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "提示:使用 @/path/to/image.png 来指定图片,否则将自动截图")
+    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "支持的图片格式: .png, .jpg, .jpeg")
+    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "输入 'quit' 或 'exit' 退出\n")
+
     message = input("请开始对话: ")
     wait_second = AUTO_SPEAK_WAIT_SECOND
     try:
         while message != "quit" and message != "exit":
+            image_base64 = None
+
+            # Look for @-prefixed image paths in the user input
+            image_paths = extract_image_paths(message)
+
+            if image_paths:
+                # Image paths found: read the first image file
+                image_path = image_paths[0]
+                if VERBOSE:
+                    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"检测到图片路径: {image_path}")
+                image_base64 = image_file_to_base64(image_path)
+                # Strip the image path references from the message
+                message = remove_image_paths_from_text(message, image_paths)
+            else:
+                # No image path given, so take a screenshot instead
+                if VERBOSE:
+                    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "未检测到图片路径,自动截图...")
+                image_base64 = capture_screenshot()
+                if image_base64 and VERBOSE:
+                    PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功")
+
             PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
-            await achat(engine, message)
+            await achat(llm, message, image_base64)
             PrintColorful(ConsoleFrontColor.RESET,"")
+
+            # Wait for user input
            message = await ainput(wait_second)
             if not message:
+                # No user input: back off, then let the AI speak on its own (auto-screenshot)
                 wait_second = max(wait_second*1.5, 3600)
+                if VERBOSE:
+                    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"用户无输入,等待 {wait_second} 秒后 AI 自主发言...")
+                # Trigger the AI-initiated turn (a screenshot is taken automatically)
+                PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
+                await achat(llm, "", None, auto_screenshot=True)
+                PrintColorful(ConsoleFrontColor.RESET,"")
             else:
                 wait_second = AUTO_SPEAK_WAIT_SECOND
     finally:
@@ -353,10 +643,25 @@ async def main():
     try:
         ollama_llm = Ollama(**ollama_llm_config)
         Settings.llm = ollama_llm
-        chat_engine = SimpleChatEngine.from_defaults(**chat_engine_config)
-        await event_loop(chat_engine)
+
+        # If a system prompt is configured, set it on the LLM
+        if SYSTEM_PROMPT_PATH is not None:
+            system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText()
+            if system_prompt and hasattr(ollama_llm, 'system_prompt'):
+                ollama_llm.system_prompt = system_prompt
+                if VERBOSE:
+                    PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "系统提示已设置到 LLM")
+
+        if VERBOSE:
+            PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "Ollama LLM 已初始化(支持多模态)")
+
+        # Use the Ollama LLM directly instead of SimpleChatEngine
+        await event_loop(ollama_llm)
     except Exception as e:
         config.Log("Error", f"Error: {e}")
+        if VERBOSE:
+            import traceback
+            traceback.print_exc()
         return
     finally:
         cleanup_audio()