import requests
import json
import csv
import time
from typing import List, Dict
from concurrent.futures import ThreadPoolExecutor, as_completed
# Configuration
API_URL = "your API endpoint URL"
API_KEY = "your LLM API key"
JSON_PATH = r"path to the JSON file containing the text fragments"
CSV_PATH = r"path where the output CSV will be saved"
MODEL_NAME = "name of the model to call"
START_INDEX = 4  # start from the 5th fragment (0-based index)
THREADS = 10     # number of concurrent worker threads
RETRIES = 3      # number of retries on failure
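# Illustrative values only (not from the original script). The request payload and the
# response parsing below follow the OpenAI-compatible chat completions schema, so any
# endpoint of that kind should work, for example:
#   API_URL = "https://api.openai.com/v1/chat/completions"
#   MODEL_NAME = "gpt-4o-mini"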
def load_json_fragments(file_path: str) -> List[str]:
    """Load and validate the JSON fragment file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        if isinstance(data, list):
            return [str(item) for item in data if item]
        elif isinstance(data, dict):
            return [str(value) for value in data.values() if value]
        else:
            raise ValueError("Unsupported JSON structure: expected a list or a dict")
    except Exception as e:
        print(f"Failed to load JSON file: {e}")
        return []
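# The loader accepts either of the following JSON shapes (illustrative examples only):
#   ["first text fragment", "second text fragment", ...]
#   {"chunk_1": "first text fragment", "chunk_2": "second text fragment", ...}
# For a dict, only the values are used as fragments; empty entries are skipped.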
def generate_question_with_retry(fragment: str) -> str:
    """Generate a question for one fragment, retrying on failure."""
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {API_KEY}"
    }
    messages = [
        {"role": "system", "content": "You are a professional question generator. Generate one relevant question based on the provided text fragment."},
        {"role": "user", "content": f"Based on the following fragment:\n{fragment[:2000]}\nGenerate one relevant question."}
    ]
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "temperature": 0.7,
        "max_tokens": 500
    }
    for attempt in range(RETRIES):
        try:
            response = requests.post(API_URL, headers=headers, json=payload, timeout=266)
            response.raise_for_status()  # any non-2xx status raises and triggers a retry
            return response.json()["choices"][0]["message"]["content"].strip()
        except Exception as e:
            if attempt < RETRIES - 1:
                print(f"Retrying ({attempt + 1}/{RETRIES}): {e}")
                time.sleep(2 ** attempt)  # exponential backoff
            else:
                print(f"Failed after all retries: {e}")
    return "Generation failed"
def process_fragment(index: int, fragment: str, fragments: List[str]) -> Dict:
    """Process a single fragment. The fragment list is passed in so each worker
    does not re-read the JSON file on every call."""
    print(f"Processing fragment {index + 1}...")
    # Generate the question
    question = generate_question_with_retry(fragment)
    # Look up the neighbouring fragments in the already-loaded list
    prev_frag = fragments[index - 1] if index > 0 else "No previous fragment"
    next_frag = fragments[index + 1] if index < len(fragments) - 1 else "No next fragment"
    return {
        "Question (LLM-generated)": question,
        "Retrieved fragment": fragment,
        "Previous fragment": prev_frag,
        "Next fragment": next_frag
    }
def save_results(results: List[Dict]):
    """Save the results to a CSV file."""
    try:
        with open(CSV_PATH, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.DictWriter(f, fieldnames=[
                "Question (LLM-generated)",
                "Retrieved fragment",
                "Previous fragment",
                "Next fragment"
            ])
            writer.writeheader()
            writer.writerows(results)
        print(f"Saved {len(results)} records to {CSV_PATH}")
    except Exception as e:
        print(f"Failed to save CSV file: {e}")
def main():
    # Load the data
    fragments = load_json_fragments(JSON_PATH)
    total = len(fragments)
    print(f"Loaded {total} text fragments")
    # Determine the range to process
    start = START_INDEX
    end = total
    process_range = range(start, end)
    # Create the thread pool
    results = []
    with ThreadPoolExecutor(max_workers=THREADS) as executor:
        # Submit the tasks
        futures = {executor.submit(process_fragment, i, fragments[i], fragments): i for i in process_range}
        # Collect results as they complete
        for future in as_completed(futures):
            try:
                result = future.result()
                results.append(result)
                print(f"Completed {len(results)}/{len(process_range)}")
            except Exception as e:
                print(f"Processing failed: {e}")
    # Save the results
    save_results(results)
if __name__ == "__main__":
    start_time = time.time()
    main()
    print(f"Total time: {time.time() - start_time:.2f} seconds")
try:
    response = requests.post(API_URL, headers=headers, json=payload, timeout=266)
    response.raise_for_status()

The timeout in this snippet is a tunable call parameter: adjust it to match how long your endpoint takes to respond. Because each request is a slow, I/O-bound interaction, the script relies on the thread pool to process many fragments concurrently.
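Before launching a full run, it can help to verify the credentials, timeout, and output format on just a few fragments. The sketch below is one way to do that, assuming the configuration constants above have been filled in; smoke_test and its sample_size parameter are illustrative names, not part of the original script.

def smoke_test(sample_size: int = 3) -> None:
    """Process only the first few fragments with a small thread pool as a sanity check."""
    fragments = load_json_fragments(JSON_PATH)
    indices = range(min(sample_size, len(fragments)))
    results = []
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = {executor.submit(process_fragment, i, fragments[i], fragments): i for i in indices}
        for future in as_completed(futures):
            results.append(future.result())
    save_results(results)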