Skip to content

Commit

Permalink
improve LM
Browse files Browse the repository at this point in the history
  • Loading branch information
gfhdhytghd committed Aug 25, 2024
1 parent 69b4a61 commit 484ca22
Show file tree
Hide file tree
Showing 7 changed files with 391 additions and 27 deletions.
20 changes: 10 additions & 10 deletions SendQzone/processsend.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,14 @@ sendimagetoqqgroup() {
fi

find "$folder_path" -maxdepth 1 -type f | sort | while IFS= read -r file_path; do
echo "发送文件: $file_path"
msg=[CQ:image,file=file://$file_path]
encoded_msg=$(python3 -c "import urllib.parse; print(urllib.parse.quote('''$msg'''))")
# 构建 curl 命令,并发送编码后的消息
cmd="curl \"http://127.0.0.1:8083/send_group_msg?group_id=$groupid&message=$encoded_msg\""
echo $cmd
eval $cmd
sleep 1 # 添加延时以避免过于频繁的请求
echo "发送文件: $file_path"
msg=[CQ:image,file=file://$file_path]
encoded_msg=$(python3 -c "import urllib.parse; print(urllib.parse.quote('''$msg'''))")
# 构建 curl 命令,并发送编码后的消息
cmd="curl \"http://127.0.0.1:8083/send_group_msg?group_id=$groupid&message=$encoded_msg\""
echo $cmd
eval $cmd
sleep 1 # 添加延时以避免过于频繁的请求
done
echo "所有文件已发送"
}
Expand Down Expand Up @@ -260,15 +260,15 @@ processsend(){
echo process-json...
./getmsgserv/LM_work/progress-lite-json.sh ${id} ${numnext}
echo 'wait-for-LM...'
python3 ./getmsgserv/LM_work/sendtoLM.py ${numnext}
python3 ./getmsgserv/LM_work/sendtoLM-MTP.py ${numnext}
for i in {1..3}
do
if [ -f "./getmsgserv/post-step2/${numnext}.json" ]; then
echo "File exists, continuing..."
break
else
echo "File not found, running Python LM script..."
python3 ./getmsgserv/LM_work/sendtoLM.py "${numnext}"
python3 ./getmsgserv/LM_work/sendtoLM-MTP.py "${numnext}"
fi

if [ "$i" -eq 3 ] && [ ! -f "./getmsgserv/post-step2/${numnext}.json" ]; then
Expand Down
2 changes: 2 additions & 0 deletions getmsgserv/HTMLwork/gotohtml.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@
if (contentHeight <= pageHeight4in) {{
pageSize = '4in 4in'; // Use 4in x 4in if content fits
}} else if (contentHeight >= 2304){{
pageSize = '4in 24in'
}} else {{
const containerHeightInInches = (contentHeight / 96 + 0.1);
pageSize = `4in ${{containerHeightInInches}}in`; // Set height to container's height
Expand Down
13 changes: 10 additions & 3 deletions getmsgserv/HTMLwork/gotojpg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,19 @@ folder=./getmsgserv/post-step5/${input}
jsonfile=./getmsgserv/post-step2/${input}.json
rm -rf $folder
mkdir -p "$folder"
convert -density 720 -quality 100 ./getmsgserv/post-step4/${input}.pdf ./getmsgserv/post-step5/${input}/${input}.jpeg
# 使用identify获取PDF页数
pages=$(identify -format "%n\n" ./getmsgserv/post-step4/${input}.pdf | head -n 1)
# 循环处理每一页
for ((i=0; i<$pages; i++)); do
formatted_index=$(printf "%02d" $i)
convert -density 360 -quality 90 ./getmsgserv/post-step4/${input}.pdf[$i] $folder/${input}-${formatted_index}.jpeg
done
existing_files=$(ls "$folder" | wc -l)
next_file_index=$existing_files
jq -r '.messages[].message[] | select(.type=="image") | .data.url' "$jsonfile" | while read -r url; do
# 下载文件并命名
curl -o "$folder/$input-$next_file_index" "$url"
formatted_index=$(printf "%02d" $next_file_index)
curl -o "$folder/$input-${formatted_index}" "$url"
# 增加文件索引
next_file_index=$((next_file_index + 1))
done
Expand All @@ -24,4 +31,4 @@ for file in *.*; do
mv "$file" "$base_name"
fi
done
cd -
cd -
28 changes: 17 additions & 11 deletions getmsgserv/LM_work/progress-lite-json.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,23 @@ jq -c '.[] | select(.message != null) | .message[] | select(.type == "image")' "

# 下载文件并命名
local_file="$folder/$input-$next_file_index.png"
curl -o "$local_file" "$url"

# 使用 jq 替换 URL 为本地文件路径(file://$(pwd)/)
jq --arg old_url "$url" --arg new_url "file://$pwd_path/getmsgserv/post-step1/${input}/$(basename "$local_file")" \
'map(
if .message then
.message |= map(if .type == "image" and .data.url == $old_url then .data.url = $new_url else . end)
else
.
end
)' "$temp_json" > "$temp_json.tmp" && mv "$temp_json.tmp" "$temp_json"

# 检查文件是否已经存在
if [ -f "$local_file" ]; then
echo "文件 $local_file 已存在,跳过下载。"
else
curl -o "$local_file" "$url"

# 使用 jq 替换 URL 为本地文件路径(file://$(pwd)/)
jq --arg old_url "$url" --arg new_url "file://$pwd_path/getmsgserv/post-step1/${input}/$(basename "$local_file")" \
'map(
if .message then
.message |= map(if .type == "image" and .data.url == $old_url then .data.url = $new_url else . end)
else
.
end
)' "$temp_json" > "$temp_json.tmp" && mv "$temp_json.tmp" "$temp_json"
fi

# 增加文件索引
next_file_index=$((next_file_index + 1))
Expand Down
176 changes: 176 additions & 0 deletions getmsgserv/LM_work/sendtoLM-MTP-stream.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import json
import time
import sys
import random
import dashscope
from http import HTTPStatus
from dashscope import Generation
from dashscope.api_entities.dashscope_response import Role
import re


def read_config(file_path):
    """Parse a simple ``key=value`` config file into a dict.

    Each line is split on its FIRST ``=`` only, so values that themselves
    contain ``=`` (for example base64-like API keys) are kept intact.
    Surrounding whitespace is removed from keys and values, and double
    quotes wrapping a value are stripped. Blank lines and lines without
    an ``=`` are ignored instead of raising ``ValueError``.

    Args:
        file_path: Path of the config file to read (decoded as UTF-8).

    Returns:
        dict mapping each key to its string value.
    """
    config = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            key, sep, value = line.strip().partition('=')
            if not sep:
                # Blank line or a line with no '=': nothing to store.
                continue
            config[key.strip()] = value.strip().strip('"')
    return config


def fetch_response_in_parts(prompt, max_rounds=5):
    """Stream a completion from the model, asking it to continue until done.

    The model is called with streaming enabled and every chunk is echoed to
    stdout as it arrives. A round is considered finished when its text
    (ignoring trailing whitespace) ends with a closing ``` fence. Otherwise
    the partial answer is appended to the conversation together with a
    "continue where you stopped" instruction and the model is called again,
    for at most ``max_rounds`` calls in total.

    Args:
        prompt: User prompt sent after the fixed system message.
        max_rounds: Upper bound on the number of model calls.

    Returns:
        The concatenated text of all rounds. It may be incomplete if the
        round limit was reached or the API kept failing.
    """
    messages = [
        {'role': 'system', 'content': '你是一个校园墙投稿管理员'},
        {'role': 'user', 'content': prompt},
    ]

    full_response = ""
    round_count = 0
    is_complete = False

    while not is_complete and round_count < max_rounds:
        seed = random.randint(1, 10000)
        print(f"Round {round_count + 1} - Using seed: {seed}")

        # Call the generation model in streaming mode.
        responses = Generation.call(
            model='qwen2-72b-instruct',
            messages=messages,
            seed=seed,
            result_format='message',
            stream=True,
            incremental_output=True,
            max_tokens=6000,
            temperature=0.50,
            repetition_penalty=1.0
        )

        # Collect the streamed chunks of this round.
        chunks = []
        for response in responses:
            if response.status_code != HTTPStatus.OK:
                print(f"Error in API call: {response.status_code}, {response.message}")
                break
            choices = response.output.get('choices') or []
            if not choices:
                # A chunk without choices carries no text; skip it rather
                # than crash on choices[0].
                continue
            chunk = choices[0].get('message', {}).get('content', '') or ''
            chunks.append(chunk)
            sys.stdout.write(chunk)  # Echo every chunk as it arrives.
            sys.stdout.flush()
        output_content = "".join(chunks)

        if round_count > 0:
            # From the second round on, the model may repeat the ```json
            # header despite being told not to. Drop everything from that
            # header up to the first CJK character that follows it.
            # NOTE(review): if no CJK character follows, the rest of this
            # round is dropped entirely - confirm this is intended.
            start_index = output_content.find("```json")
            if start_index != -1:
                end_index = start_index
                while end_index < len(output_content):
                    if '\u4e00' <= output_content[end_index] <= '\u9fff':
                        break
                    end_index += 1
                output_content = output_content[:start_index] + output_content[end_index:]

        full_response += output_content

        # The answer is complete once it ends with the closing fence. Models
        # commonly emit a trailing newline after it, so ignore trailing
        # whitespace when checking.
        if output_content.rstrip().endswith('```'):
            is_complete = True
        elif output_content:
            messages.append({
                'role': Role.ASSISTANT,
                'content': output_content
            })
            messages.append({'role': Role.USER, 'content': '接着上次停下的地方继续输出,不要重复之前的内容,不要重复sender和needpriv等内容,不要在开头重复一遍```json {"time": },{"message": [{"type": ,"data": {,不要在开头重复任何格式内容,直接接着上次结束的那个字继续'})
        # else: the round produced no text (e.g. the API call failed), so the
        # same conversation is retried instead of adding an empty assistant
        # turn to it.

        round_count += 1

    return full_response


def _clean_messages(data, fields_to_remove=('message_id', 'file', 'subType', 'file_id', 'file_size')):
    """Return data['messages'] with bookkeeping fields removed (items are edited in place)."""
    cleaned_messages = []
    for item in data.get('messages', []):
        for field in fields_to_remove:
            item.pop(field, None)
        if 'message' in item and isinstance(item['message'], list):
            for message in item['message']:
                if 'data' in message and isinstance(message['data'], dict):
                    for field in fields_to_remove:
                        message['data'].pop(field, None)
        cleaned_messages.append(item)
    return cleaned_messages


def _build_prompt(input_content, timenow):
    """Build the grouping prompt sent to the model for one chat record."""
    return (
        "当前时间"f"{timenow}\n"
        f"{input_content}\n\n"
        "这是按照时间顺序排序的一组的校园墙投稿的聊天记录\n"
        "将这里你认为需要放在同一稿件中,属于同一组事件的信息拆分出来,一组信息通常以在吗或者投稿或者墙之类的词语开始,但有时此类词语也会在中间才出现或者不出现,有时这些信息会包含image或者video,通常这些信息time会比较接近(大多数情况下time差距在600内,但也有例外),大部分情况下这整个记录里只包含一个稿件的内容(偶尔有例外),如果你认为这里只有一个稿件,那么所有内容都是一组中的,输出为json格式(需要```json开头和```结尾),只输出最后一组,不要输出任何额外内容\n\n"
        "输出格式如下:\n"
        "{\n"
        " \"sender\": {\n"
        " #直接抄写即可\n"
        " \"user_id\": ,\n"
        " \"nickname\": ,\n"
        " \"sex\": \n"
        " },\n"
        " \"needpriv\": \"true\"/\"false\",\n"
        " # 判断这条信息是否需要匿名,有时匿名意思会通过“匿”或者”码”的谐音字传达,有时也会通过“🐎”“🐴”之类的emojy传达\n"
        " \"safemsg\": \"true\"/\"false\",\n"
        " # 判断这条信息是否可以过审(是否含有攻击性信息或者政治信息)\n"
        " \"isover\": \"true\"/\"false\",\n"
        " # 判断他有没有说完,通常通过用语义来判断,检查记录中是否有“没发完”“发完了”一类的语句,判断已经发来的内容是否构成一个完整的稿件,也可以通过time判断,在最后一条消息的time距离timenow很久远的情况下可以判断为完整稿件,只有在非常肯定他发完了的情况下才为true\n"
        " \"notregular\": \"true/false\",\n"
        # The trailing newline below keeps the "messages" key from being
        # glued onto the end of this comment line in the template.
        " # 直接抄写即可\n"
        " \"messages\": [\n"
        " # 接下来输出分好组的message信息\n"
        " {\n"
        " \"message\": [\n"
        " {\n"
        " \"type\": ,\n"
        " \"data\": {\n"
        " # 填写数据\n"
        " }\n"
        " }\n"
        " ],\n"
        " \"time\": \n"
        " }\n"
        # The comma after the array keeps the sample a well-formed object,
        # since another key ("why") follows.
        " ],\n"
        " \"why\": {\n"
        " #在此填写你分段和填写各项目的依据与理由和原因\n"
        " }\n"
        "}"
    )


def _extract_json_text(response_text):
    """Return the payload between the first ``` fence and the last one.

    ``str.strip('```json\\n')`` removes a *set of characters*, not a prefix,
    so it cannot cope with text before the fence or whitespace after it.
    Here the fence markers are located explicitly instead. If no fence is
    present the trimmed text is returned unchanged.
    """
    text = response_text.strip()
    start = text.find('```')
    if start != -1:
        text = text[start + 3:]
        if text.startswith('json'):
            text = text[len('json'):]
        end = text.rfind('```')
        if end != -1:
            text = text[:end]
    return text.strip()


def main():
    """Group one submission's chat record with the model and save the result.

    Reads ``./getmsgserv/post-step1/<name>.json`` (``<name>`` is the first
    command-line argument), removes bookkeeping fields, asks the model to
    group the messages, and writes the parsed JSON answer to
    ``./getmsgserv/post-step2/<name>.json``. If the answer is not valid JSON
    the error and the raw answer are printed and no file is written.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} <name>", file=sys.stderr)
        sys.exit(1)

    config = read_config('oqqwall.config')
    dashscope.api_key = config.get('apikey')

    name = sys.argv[1]
    input_file_path = f'./getmsgserv/post-step1/{name}.json'
    output_file_path = f'./getmsgserv/post-step2/{name}.json'

    with open(input_file_path, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    output_data = {
        "sender": data.get("sender"),
        "notregular": data.get("notregular"),
        "messages": _clean_messages(data)
    }

    input_content = json.dumps(output_data, ensure_ascii=False, indent=4)
    timenow = time.time()
    prompt = _build_prompt(input_content, timenow)

    # Fetch the model answer via streaming, possibly over several rounds.
    final_response = fetch_response_in_parts(prompt)

    try:
        formatted_data = json.loads(_extract_json_text(final_response))
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}\n返回内容: {final_response}")
        return

    with open(output_file_path, 'w', encoding='utf-8') as outfile:
        json.dump(formatted_data, outfile, ensure_ascii=False, indent=4)
    print("处理完成,输出已保存到:", output_file_path)


if __name__ == '__main__':
    main()
Loading

0 comments on commit 484ca22

Please sign in to comment.