已经可以在单一环境中运行 TTS 与 CLI

This commit is contained in:
2025-12-21 04:39:58 +08:00
parent 09140e4000
commit f8c2a36a2f
4 changed files with 107 additions and 56 deletions

Submodule .cursor updated: 67480b7ec2...66e8c67fc0

97
cli.py
View File

@@ -6,22 +6,22 @@ from llama_index.llms.ollama import Ollama
from llama_index.core.chat_engine import SimpleChatEngine from llama_index.core.chat_engine import SimpleChatEngine
from llama_index.core.chat_engine.types import StreamingAgentChatResponse from llama_index.core.chat_engine.types import StreamingAgentChatResponse
from llama_index.core.llms import ChatMessage from llama_index.core.llms import ChatMessage
try: # try:
from llama_index.core.llms.types import ImageBlock, TextBlock # from llama_index.core.llms.types import ImageBlock, TextBlock
except ImportError: # except ImportError:
try: # try:
# 尝试其他可能的导入路径 # # 尝试其他可能的导入路径
from llama_index.core import ImageBlock, TextBlock # from llama_index.core import ImageBlock, TextBlock
except ImportError: # except ImportError:
# 如果都失败,定义简单的占位类 # # 如果都失败,定义简单的占位类
class ImageBlock: # class ImageBlock:
def __init__(self, base64_str=None, path=None): # def __init__(self, base64_str=None, path=None):
self.base64_str = base64_str # self.base64_str = base64_str
self.path = path # self.path = path
#
class TextBlock: # class TextBlock:
def __init__(self, text=""): # def __init__(self, text=""):
self.text = text # self.text = text
from llama_index.core import Settings from llama_index.core import Settings
from Convention.Runtime.File import ToolFile from Convention.Runtime.File import ToolFile
import requests import requests
@@ -358,7 +358,7 @@ def image_file_to_base64(image_path: str) -> str:
PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"读取图片文件失败 {image_path}: {e}") PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"读取图片文件失败 {image_path}: {e}")
return None return None
async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> None: async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base64: str, end_symbol: list) -> str:
""" """
直接调用 Ollama API 进行带图片的流式聊天 直接调用 Ollama API 进行带图片的流式聊天
@@ -382,11 +382,15 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base
"role": msg.role if hasattr(msg, 'role') else "user", "role": msg.role if hasattr(msg, 'role') else "user",
"content": msg.content if hasattr(msg, 'content') else str(msg) "content": msg.content if hasattr(msg, 'content') else str(msg)
} }
# 如果是第一条用户消息且有图片,添加图片
if (hasattr(msg, 'role') and msg.role == "user") and image_base64 and len(api_messages) == 0:
api_msg["images"] = [image_base64]
api_messages.append(api_msg) api_messages.append(api_msg)
# 如果有图片,添加到最后一条用户消息(当前用户消息)
if image_base64:
for i in range(len(api_messages) - 1, -1, -1):
if api_messages[i].get("role") == "user":
api_messages[i]["images"] = [image_base64]
break
payload = { payload = {
"model": model, "model": model,
"messages": api_messages, "messages": api_messages,
@@ -441,6 +445,8 @@ async def _ollama_stream_chat_with_image(llm: Ollama, messages: list, image_base
if len(buffer_response) > 0: if len(buffer_response) > 0:
if TTS_ENABLE: if TTS_ENABLE:
await play_vocal(buffer_response) await play_vocal(buffer_response)
return buffer_response
except Exception as e: except Exception as e:
PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"Ollama API 调用错误: {e}") PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"Ollama API 调用错误: {e}")
if VERBOSE: if VERBOSE:
@@ -467,7 +473,7 @@ def capture_screenshot() -> str:
PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"截图失败: {e}") PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"截图失败: {e}")
return None return None
async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False) -> None: async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, auto_screenshot: bool = False, conversation_history: Optional[list] = None) -> list:
""" """
使用 Ollama LLM 进行多模态聊天 使用 Ollama LLM 进行多模态聊天
@@ -501,16 +507,22 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a
# 图片将通过 Ollama 的底层 API 传递 # 图片将通过 Ollama 的底层 API 传递
chat_message = ChatMessage(role="user", content=user_message) chat_message = ChatMessage(role="user", content=user_message)
# 构建消息列表 # 构建消息列表 - 使用对话历史(如果提供)
messages = [chat_message] if conversation_history is None:
conversation_history = []
# 如果有系统提示,添加到消息列表开头 # 构建完整的消息列表(包含历史记录)
if SYSTEM_PROMPT_PATH is not None: messages = conversation_history.copy()
# 如果对话历史为空,且需要系统提示,添加 system 消息(只添加一次)
if len(messages) == 0 and SYSTEM_PROMPT_PATH is not None:
system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText() system_prompt = ToolFile(SYSTEM_PROMPT_PATH).LoadAsText()
if system_prompt: if system_prompt:
# 将系统提示添加到用户消息中,因为 Ollama 可能不支持 system role system_msg = ChatMessage(role="system", content=system_prompt)
user_message = f"{system_prompt}\n\n{user_message}" messages.append(system_msg)
messages[0] = ChatMessage(role="user", content=user_message)
# 添加当前用户消息
messages.append(chat_message)
buffer_response = "" buffer_response = ""
end_symbol = ['', '', ''] end_symbol = ['', '', '']
@@ -519,9 +531,8 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a
# 如果有图片,需要直接调用 Ollama API,因为 llama-index 的封装可能不支持图片 # 如果有图片,需要直接调用 Ollama API,因为 llama-index 的封装可能不支持图片
if image_base64: if image_base64:
# 直接调用 Ollama 的流式 API # 直接调用 Ollama 的流式 API
await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol) assistant_response = await _ollama_stream_chat_with_image(llm, messages, image_base64, end_symbol)
return else:
# 使用流式聊天(无图片时) # 使用流式聊天(无图片时)
streaming_response = await llm.astream_chat(messages) streaming_response = await llm.astream_chat(messages)
@@ -537,15 +548,24 @@ async def achat(llm: Ollama, message: str, image_base64: Optional[str] = None, a
await play_vocal(buffer_response.strip()) await play_vocal(buffer_response.strip())
buffer_response = "" buffer_response = ""
buffer_response = buffer_response.strip() assistant_response = buffer_response.strip()
if len(buffer_response) > 0: if len(assistant_response) > 0:
if TTS_ENABLE: if TTS_ENABLE:
await play_vocal(buffer_response) await play_vocal(assistant_response)
# 更新对话历史:添加用户消息和助手响应
updated_history = messages.copy()
if assistant_response:
assistant_msg = ChatMessage(role="assistant", content=assistant_response)
updated_history.append(assistant_msg)
return updated_history
except Exception as e: except Exception as e:
PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"聊天错误: {e}") PrintColorful(ConsoleFrontColor.LIGHTRED_EX, f"聊天错误: {e}")
if VERBOSE: if VERBOSE:
import traceback import traceback
traceback.print_exc() traceback.print_exc()
return conversation_history if conversation_history else []
def add_speaker() -> None: def add_speaker() -> None:
@@ -588,6 +608,9 @@ async def event_loop(llm: Ollama) -> None:
PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "支持的图片格式: .png, .jpg, .jpeg") PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "支持的图片格式: .png, .jpg, .jpeg")
PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "输入 'quit''exit' 退出\n") PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, "输入 'quit''exit' 退出\n")
# 维护对话历史,避免重复发送系统提示词
conversation_history = []
message = input("请开始对话: ") message = input("请开始对话: ")
wait_second = AUTO_SPEAK_WAIT_SECOND wait_second = AUTO_SPEAK_WAIT_SECOND
try: try:
@@ -614,19 +637,19 @@ async def event_loop(llm: Ollama) -> None:
PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功") PrintColorful(ConsoleFrontColor.LIGHTGREEN_EX, "截图成功")
PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='') PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
await achat(llm, message, image_base64) conversation_history = await achat(llm, message, image_base64, False, conversation_history)
PrintColorful(ConsoleFrontColor.RESET,"") PrintColorful(ConsoleFrontColor.RESET,"")
# 等待用户输入 # 等待用户输入
message = await ainput(wait_second) message = await ainput(wait_second)
if not message: if not message:
# 用户没有输入,触发 AI 自主发言(会自动截图) # 用户没有输入,触发 AI 自主发言(会自动截图)
wait_second = max(wait_second*1.5, 3600) wait_second = min(wait_second*1.5, 3600)
if VERBOSE: if VERBOSE:
PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"用户无输入,等待 {wait_second} 秒后 AI 自主发言...") PrintColorful(ConsoleFrontColor.LIGHTYELLOW_EX, f"用户无输入,等待 {wait_second} 秒后 AI 自主发言...")
# 触发 AI 自主发言(会自动截图) # 触发 AI 自主发言(会自动截图)
PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='') PrintColorful(ConsoleFrontColor.GREEN, "AI: ", is_reset=False, end='')
await achat(llm, "", None, auto_screenshot=True) conversation_history = await achat(llm, "", None, auto_screenshot=True, conversation_history=conversation_history)
PrintColorful(ConsoleFrontColor.RESET,"") PrintColorful(ConsoleFrontColor.RESET,"")
else: else:
wait_second = AUTO_SPEAK_WAIT_SECOND wait_second = AUTO_SPEAK_WAIT_SECOND

View File

@@ -3,12 +3,14 @@ aiofiles==23.2.1
aiohappyeyeballs==2.6.1 aiohappyeyeballs==2.6.1
aiohttp==3.13.2 aiohttp==3.13.2
aiosignal==1.4.0 aiosignal==1.4.0
aiosqlite==0.22.0
annotated-types==0.7.0 annotated-types==0.7.0
antlr4-python3-runtime==4.9.3 antlr4-python3-runtime==4.9.3
anyio==4.11.0 anyio==4.11.0
async-timeout==5.0.1 async-timeout==5.0.1
attrs==25.4.0 attrs==25.4.0
audioread==3.1.0 audioread==3.1.0
banks==2.2.0
beautifulsoup4==4.14.2 beautifulsoup4==4.14.2
cachetools==6.2.2 cachetools==6.2.2
certifi==2025.11.12 certifi==2025.11.12
@@ -21,14 +23,20 @@ conformer==0.3.2
contourpy==1.3.2 contourpy==1.3.2
cycler==0.12.1 cycler==0.12.1
Cython==3.2.1 Cython==3.2.1
dataclasses-json==0.6.7
decorator==5.2.1 decorator==5.2.1
defusedxml==0.7.1
Deprecated==1.2.18
diffusers==0.29.0 diffusers==0.29.0
dirtyjson==1.0.8
distro==1.9.0
einops==0.8.1 einops==0.8.1
exceptiongroup==1.3.0 exceptiongroup==1.3.0
fastapi==0.115.6 fastapi==0.115.6
fastapi-cli==0.0.4 fastapi-cli==0.0.4
ffmpy==1.0.0 ffmpy==1.0.0
filelock==3.20.0 filelock==3.20.0
filetype==1.2.0
flatbuffers==25.9.23 flatbuffers==25.9.23
fonttools==4.60.1 fonttools==4.60.1
frozenlist==1.8.0 frozenlist==1.8.0
@@ -38,6 +46,8 @@ google-auth==2.43.0
google-auth-oauthlib==1.0.0 google-auth-oauthlib==1.0.0
gradio==5.4.0 gradio==5.4.0
gradio_client==1.4.2 gradio_client==1.4.2
greenlet==3.3.0
griffe==1.15.0
grpcio==1.57.0 grpcio==1.57.0
grpcio-tools==1.57.0 grpcio-tools==1.57.0
h11==0.16.0 h11==0.16.0
@@ -52,6 +62,7 @@ importlib_metadata==8.7.0
inflect==7.3.1 inflect==7.3.1
intel-openmp==2021.4.0 intel-openmp==2021.4.0
Jinja2==3.1.6 Jinja2==3.1.6
jiter==0.12.0
joblib==1.5.2 joblib==1.5.2
kaldifst==1.7.17 kaldifst==1.7.17
kiwisolver==1.4.9 kiwisolver==1.4.9
@@ -59,10 +70,13 @@ lazy_loader==0.4
librosa==0.10.2 librosa==0.10.2
lightning==2.2.4 lightning==2.2.4
lightning-utilities==0.15.2 lightning-utilities==0.15.2
llama-index
llama-index-llms-ollama
llvmlite==0.45.1 llvmlite==0.45.1
Markdown==3.10 Markdown==3.10
markdown-it-py==4.0.0 markdown-it-py==4.0.0
MarkupSafe==2.1.5 MarkupSafe==2.1.5
marshmallow==3.26.1
matplotlib==3.7.5 matplotlib==3.7.5
mdurl==0.1.2 mdurl==0.1.2
mkl==2021.4.0 mkl==2021.4.0
@@ -71,17 +85,22 @@ more-itertools==10.8.0
mpmath==1.3.0 mpmath==1.3.0
msgpack==1.1.2 msgpack==1.1.2
multidict==6.7.0 multidict==6.7.0
mypy_extensions==1.1.0
nest-asyncio==1.6.0
networkx==3.1 networkx==3.1
nltk==3.9.2
numba==0.62.1 numba==0.62.1
numpy==1.26.4 numpy==1.26.4
oauthlib==3.3.1 oauthlib==3.3.1
ollama==0.6.1
omegaconf==2.3.0 omegaconf==2.3.0
onnx==1.16.0 onnx==1.16.0
onnxruntime==1.23.2 onnxruntime==1.23.2
openai==2.14.0
openai-whisper==20231117 openai-whisper==20231117
orjson==3.11.4 orjson==3.11.4
packaging==24.2 packaging==24.2
pandas==2.3.3 pandas==2.2.3
pillow==11.3.0 pillow==11.3.0
platformdirs==4.5.0 platformdirs==4.5.0
pooch==1.8.2 pooch==1.8.2
@@ -90,15 +109,18 @@ protobuf==4.25.0
pyarrow==18.1.0 pyarrow==18.1.0
pyasn1==0.6.1 pyasn1==0.6.1
pyasn1_modules==0.4.2 pyasn1_modules==0.4.2
PyAudio==0.2.14
pycparser==2.23 pycparser==2.23
pydantic==2.7.0 pydantic==2.12.5
pydantic_core==2.18.1 pydantic_core==2.41.5
pydub==0.25.1 pydub==0.25.1
Pygments==2.19.2 Pygments==2.19.2
pyparsing==3.2.5 pyparsing==3.2.5
pypdf==6.4.2
pyreadline3==3.5.4 pyreadline3==3.5.4
PySocks==1.7.1 PySocks==1.7.1
python-dateutil==2.9.0.post0 python-dateutil==2.9.0.post0
python-dotenv==1.2.1
python-multipart==0.0.12 python-multipart==0.0.12
pytorch-lightning==2.5.6 pytorch-lightning==2.5.6
pytz==2025.2 pytz==2025.2
@@ -123,9 +145,12 @@ sniffio==1.3.1
soundfile==0.12.1 soundfile==0.12.1
soupsieve==2.8 soupsieve==2.8
soxr==1.0.0 soxr==1.0.0
SQLAlchemy==2.0.45
starlette==0.41.3 starlette==0.41.3
striprtf==0.0.26
sympy==1.14.0 sympy==1.14.0
tbb==2021.13.1 tbb==2021.13.1
tenacity==9.1.2
tensorboard==2.14.0 tensorboard==2.14.0
tensorboard-data-server==0.7.2 tensorboard-data-server==0.7.2
threadpoolctl==3.6.0 threadpoolctl==3.6.0
@@ -139,6 +164,8 @@ tqdm==4.67.1
transformers==4.51.3 transformers==4.51.3
typeguard==4.4.4 typeguard==4.4.4
typer==0.20.0 typer==0.20.0
typing-inspect==0.9.0
typing-inspection==0.4.2
typing_extensions==4.15.0 typing_extensions==4.15.0
tzdata==2025.2 tzdata==2025.2
urllib3==2.5.0 urllib3==2.5.0
@@ -147,5 +174,6 @@ websockets==12.0
Werkzeug==3.1.3 Werkzeug==3.1.3
wetext==0.0.4 wetext==0.0.4
wget==3.2 wget==3.2
wrapt==1.17.3
yarl==1.22.0 yarl==1.22.0
zipp==3.23.0 zipp==3.23.0