From 5df0bdd176011a2d013d6f43a55a36294ea6997a Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Thu, 9 Nov 2023 19:25:21 +0900 Subject: [PATCH 1/8] [SWM-405] Feat : divide git action (test, main) --- .../workflows/{main.yml => deploy-main.yml} | 21 +---------- .github/workflows/deploy-test.yml | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 20 deletions(-) rename .github/workflows/{main.yml => deploy-main.yml} (61%) create mode 100644 .github/workflows/deploy-test.yml diff --git a/.github/workflows/main.yml b/.github/workflows/deploy-main.yml similarity index 61% rename from .github/workflows/main.yml rename to .github/workflows/deploy-main.yml index dbc78b9..0f8a56e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/deploy-main.yml @@ -1,4 +1,4 @@ -name: Sroom-AI-Deploy +name: Sroom-AI-Deploy-Main on: workflow_dispatch: @@ -33,22 +33,3 @@ jobs: tmux send-keys -t server "python3 main.py server" C-m tmux send-keys -t celery "celery -A celery_app worker --concurrency=10 -l info" C-m - - - name: Deploy Test Server - uses: appleboy/ssh-action@v0.1.6 - with: - host: ${{ secrets.AWS_SSH_TEST_HOST }} - username: ubuntu - key: ${{ secrets.SSH_SECRET_KEY }} - script_stop: true - script: | - cd sroom-ai/ - git pull - pip3 install -r requirements.txt - - tmux send-keys -t celery "^C" C-m - tmux send-keys -t server "^C" C-m - - tmux send-keys -t server "python3 main.py server" C-m - tmux send-keys -t celery "celery -A celery_app worker --concurrency=2 -l info" C-m - diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml new file mode 100644 index 0000000..411b083 --- /dev/null +++ b/.github/workflows/deploy-test.yml @@ -0,0 +1,35 @@ +name: Sroom-AI-Deploy-Test + +on: + workflow_dispatch: + pull_request: + branches: + - main +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} + aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY }} + aws-region: ap-northeast-2 + + - name: Deploy Test Server + uses: appleboy/ssh-action@v0.1.6 + with: + host: ${{ secrets.AWS_SSH_TEST_HOST }} + username: ubuntu + key: ${{ secrets.SSH_SECRET_KEY }} + script_stop: true + script: | + cd sroom-ai/ + git pull + pip3 install -r requirements.txt + + tmux send-keys -t celery "^C" C-m + tmux send-keys -t server "^C" C-m + + tmux send-keys -t server "python3 main.py server" C-m + tmux send-keys -t celery "celery -A celery_app worker --concurrency=2 -l info" C-m From 956ab72f46cc2c8c76c26162a3d6677e851b6c6d Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Thu, 9 Nov 2023 19:25:46 +0900 Subject: [PATCH 2/8] [SWM-405] Feat : develop summary fragmentation logic --- app/index.py | 10 ++++--- app/quiz/quiz.py | 40 ++++++++++---------------- app/quiz/quizv2.py | 30 +++++++++++-------- app/summary/summaryv2.py | 62 ++++++++++++++++++++++++++++++++++++++++ constants.yaml | 33 +++++++-------------- 5 files changed, 111 insertions(+), 64 deletions(-) create mode 100644 app/summary/summaryv2.py diff --git a/app/index.py b/app/index.py index 8df7aca..b123209 100644 --- a/app/index.py +++ b/app/index.py @@ -4,7 +4,9 @@ from app.script import script, scriptService from main import constants from app.summary import summary -from app.quiz import quiz, quizv2 +from app.summary import summaryv2 +from app.quiz import quiz +from app.quiz import quizv2 class ResponseModel: @@ -21,7 +23,7 @@ def to_dict(self): 'is_valid': self.is_valid, 'summary': self.summary, 'quizzes': self.quizzes, - 'tokens' : self.tokens + 'tokens': self.tokens } @@ -38,8 +40,8 @@ def index(video_id: str = '', video_title: str = ''): if youtube_script.is_valid: response.is_valid = 1 - summary_result = loop.run_until_complete(summary.generate_summary(youtube_script.text, video_title)) - quizzes_result = loop.run_until_complete(quizv2.generate_quizzes(summary_result, youtube_script.token_count)) + summary_result, summaries = loop.run_until_complete(summaryv2.generate_summary(youtube_script.raw_script, video_title)) + quizzes_result = loop.run_until_complete(quizv2.generate_quizzes(summaries)) response.summary = summary_result response.quizzes = quizzes_result diff --git a/app/quiz/quiz.py b/app/quiz/quiz.py index 568fd37..5fc3ad3 100644 --- a/app/quiz/quiz.py +++ b/app/quiz/quiz.py @@ -1,44 +1,34 @@ import json -from app.gpt import gpt from main import constants +from app.gpt import gpt MAX_TRY_COUNT = 3 -async def generate_quiz(summary: str): - - quiz_prompt = constants['prompt']['quiz']['kr'] - prompt = summary + quiz_prompt +async def generate_quizzes(summary: str, script_tokens: int): + quiz_count = set_quiz_count(script_tokens) + prompt = summary + constants['prompt']['multiple_choice_quiz']['kr'] + str(quiz_count) quiz_json = {} + system_message = constants['prompt']['multiple_choice_quiz']['system_message'] for count in range(MAX_TRY_COUNT): - gpt_response = await gpt.request_gpt(prompt) + gpt_response = await gpt.request_gpt(prompt, system_message) quiz_json, is_valid = _reformat_quiz(gpt_response) if is_valid: break - quizzes = [] - for quiz in quiz_json['quizzes']: - # quiz = await validation_quiz(json.dumps(quiz)) - quizzes.append(quiz) - - return quizzes - - -async def validation_quiz(raw_quiz: str): + return quiz_json - quiz_validation_prompt = constants['prompt']['quiz_validation']['kr'] - prompt = raw_quiz + quiz_validation_prompt - quiz_json = {} - for count in range(MAX_TRY_COUNT): - gpt_response = await gpt.request_gpt(prompt) - quiz_json, is_valid = _reformat_quiz(gpt_response) - if is_valid: - break +def set_quiz_count(script_tokens: int): + quiz_count = 3 + if script_tokens > 5000: + quiz_count += int((script_tokens - 5000) / 2500) + 1 + if quiz_count > 15: + quiz_count = 15 - return quiz_json + return quiz_count def _reformat_quiz(quiz_json: str): @@ -49,6 +39,6 @@ def _reformat_quiz(quiz_json: str): quiz_json = json.loads(quiz_json) except json.decoder.JSONDecodeError as e: print("JSON Decode Error : retry generate quiz") - return {'quizzes': []}, False + return [{'quiz_type': 1, 'quiz_question': 'ERROR!', 'quiz_select_options': ['퀴즈 생성중 오류가 발생했습니다. ㅠㅠ'], 'answer': 1}], False return quiz_json, True diff --git a/app/quiz/quizv2.py b/app/quiz/quizv2.py index 5fc3ad3..5dbcb59 100644 --- a/app/quiz/quizv2.py +++ b/app/quiz/quizv2.py @@ -1,3 +1,4 @@ +import asyncio import json from main import constants @@ -6,8 +7,23 @@ MAX_TRY_COUNT = 3 -async def generate_quizzes(summary: str, script_tokens: int): - quiz_count = set_quiz_count(script_tokens) +async def generate_quizzes(summaries: list): + + if len(summaries) == 1: + quiz_count = 3 + else: + quiz_count = 2 + + tasks = [generate_quizzes_chunk(summary, quiz_count) for summary in summaries] + quiz_chunk_list = await asyncio.gather(*tasks) + + quiz_list = [] + for quiz_chunk in quiz_chunk_list: + quiz_list.extend(quiz_chunk) + return quiz_list + + +async def generate_quizzes_chunk(summary: str, quiz_count: int): prompt = summary + constants['prompt']['multiple_choice_quiz']['kr'] + str(quiz_count) quiz_json = {} system_message = constants['prompt']['multiple_choice_quiz']['system_message'] @@ -21,16 +37,6 @@ async def generate_quizzes(summary: str, script_tokens: int): return quiz_json -def set_quiz_count(script_tokens: int): - quiz_count = 3 - if script_tokens > 5000: - quiz_count += int((script_tokens - 5000) / 2500) + 1 - if quiz_count > 15: - quiz_count = 15 - - return quiz_count - - def _reformat_quiz(quiz_json: str): quiz_json = quiz_json.replace("\n", "") quiz_json = quiz_json.replace("\"", '"') diff --git a/app/summary/summaryv2.py b/app/summary/summaryv2.py new file mode 100644 index 0000000..23d95f9 --- /dev/null +++ b/app/summary/summaryv2.py @@ -0,0 +1,62 @@ +import asyncio +import datetime +import re + +from main import constants +from app.gpt import gpt + + +async def generate_summary(scripts: dict, video_title: str): + time_stamp, chunks = divide_chunk(scripts) + summary_prompt = constants['prompt']['final_summary']['kr'] + + tasks = [gpt.request_gpt(summary_prompt + "\n script : " + chunk, + constants['prompt']['final_summary']['system_message']) for idx, chunk in enumerate(chunks)] + + summaries = await asyncio.gather(*tasks) + + final_summary = '' + for idx, summary in enumerate(summaries): + time_delta = datetime.timedelta(seconds=int(time_stamp[idx])) + time_format = str(time_delta) + final_summary += '### ' + time_format + '' + '\n' + final_summary += summary + '\n \n \n' + + final_summary = reformat_summary(final_summary) + return final_summary, summaries + + +def divide_chunk(scripts: dict): + + chunk_text = '' + time_stamp = 0 + + time_stamps = [] + chunks = [] + for script in scripts: + if len(chunk_text) > 3000: + chunk_text.replace("[음악]", "") + chunk_text.replace("[박수]", "") + chunks.append(chunk_text) + time_stamps.append(time_stamp) + time_stamp = script['start'] + chunk_text = script['text'] + ' ' + else: + chunk_text += script['text'] + + if len(chunk_text) < 1000: + chunks[-1] += chunk_text + else: + time_stamps.append(time_stamp) + chunks.append(chunk_text) + + print(time_stamps) + print(chunks) + + return time_stamps, chunks + + +def reformat_summary(summary: str): + summary.replace("\#", "#") + summary = re.sub(r"```", "", summary) + return summary diff --git a/constants.yaml b/constants.yaml index 9b9329a..4992c16 100644 --- a/constants.yaml +++ b/constants.yaml @@ -24,33 +24,20 @@ model_parameter : prompt : system_message : {"role": "system", "content": "You are an assistant that generates quizzes and summaries"} final_summary: - system_message : {"role": "system", "content": "write the given text in the given markdown format, where applicable : ## {Title} \n ### {section name} \n {content}"} - en : "\n\n Please summarize the article in make sure to write the summary in Korean. Please write in Korean but English terms in English. And please make the summary in the form of a markdown raw code with escape sequence. And please exclude inline codes such as ```." - kr : "\n\n 위 글을 한국어로 마크다운 형식으로 만들어줘. 코드블록은 빼고 작성해줘.\n - Heading2로 제목을 달아주고, 각 키워드나 핵심 내용들을 나눠서 작성해줘.\n\n" + system_message : {"role": "system", "content": "write the given text in the given markdown format, where applicable : ### {section name} \n {content}"} + en : "\n\n Please summarize the article in make sure to write the summary in Korean. Please write in Korean but English terms in English. And please make the summary in the form of a markdown raw code with escape sequence." + kr : "\n\n 아래는 유튜브 스크립트야. 해당 스크립트를 한국어로 마크다운 형식으로 요약해줘.\n + 키워드나 핵심 내용들을 나눠서 아래 예시 처럼 작성해줘.\n\n + 예시 : \n + ### 6.25전쟁 발발 이유 \n + - 미국이 애치슨 라인을 발표하였고 한국이 해당 영역에 들지 못했다. \n + - 소련이 북한에 군사적 지원을 했다. + " + summary : system_message: { "role": "system", "content": "summarize the given text in korean so that it contains everything information as much as possible." } en : "\n\n Please summarize the above script so that everything is reflected as much as possible. Please write a summary as if the student is writing down the contents of the class in a notebook. Please make the summary in Korean." kr : "\n\n 아래는 유튜브 영상 제목과 그 영상의 스크립트야. 제목과 스크립트 내용을 바탕으로 요약을 진행해줘. 최대한 디테일하게 내용을 담아줘. 요약은 한국어로 진행해줘." - quiz : - en : "\n\n Based on the summary above, please give me 1 multiple choice question, 1 short answer question, and 1 True or False question. \n - Please answer in JSON format and follow the format below. \n - quizzes: [{\"quiz_type\":\"\" ,\"quiz_question\": \"\" \"quiz_select_options\": [], \"answer\":\"\"}] \n - quiz_type is assigned to 1 for multiple choice, 2 for short answer, and 3 for True or False \n - quiz_question allows questions to go in\" - Please put multiple choice options in the quiz_select_option list. The number of optional list elements must be 5. In multiple choice, answer is to put a number in the option list with the correct answer among the options (starting from 1)\n - For short answer and TF problems, please return the option list as an empty list. Please unify the blank list format to []\n - Please return the TF question to 1 if true and 0 if false. All answer must be filled out. If the answer is too long, please give me a short answer. Make sure to give one question for each type. Please write the quiz questions, answers, and options in Korean" - kr : "\n\n 위 요약본을 바탕으로 퀴즈를 만들어줘. 퀴즈는 객관식, 주관식, true or false 문제 각각 1문제씩 만들어줘.\n - 퀴즈는 json 형식으로 만들어주고, 구체적인 형식은 아래와 같아. \n - quizzes: [{\"quiz_type\":\"\" ,\"quiz_question\": \"\" \"quiz_select_options\": [], \"answer\":\"\"}] \n - quiz_type은 객관식은 1, 주관식은 2, true or false 문제는 3으로 할당해줘.\n - quiz_question에는 퀴즈 문제가 들어가게 해줘.\n - quiz_select_options에는 객관식에서 선택할 수 있는 선택 옵션들이 string형식으로 들어간 리스트로 만들어줘. 주관식, true or false 같은 경우에는 빈 리스트인 [] 형식으로 반환해줘.\n - answer는 객관식의 경우에는 quiz_select_options에서 정답인 요소가 들어간 index번호를 주면 되 (인덱스 시작번호는 1이야).\n - 주관식은 주관식의 답이 string형식으로 들어가고 리스트와 같은 다른 형식은 허용되지 않아. - true or false 문제는 답이 true일 경우 1, 거짓일 경우 0이 들어가면되. - 모든 answer항목은 반드시 답이 채워져 있어야해. 빈칸이면 안되." multiple_choice_quiz: system_message: { "role": "system", "content": "Make a multiple choice quiz with the given text" } From 3527a997089737c0c2bb79e6dde46fa7342a335e Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Thu, 9 Nov 2023 19:39:38 +0900 Subject: [PATCH 3/8] [SWM-405] Feat : modify test CICD --- .github/workflows/deploy-test.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index 411b083..7035e39 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -15,7 +15,7 @@ jobs: aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY }} aws-secret-access-key: ${{ secrets.AWS_SECRET_KEY }} aws-region: ap-northeast-2 - + - name: Deploy Test Server uses: appleboy/ssh-action@v0.1.6 with: @@ -25,7 +25,11 @@ jobs: script_stop: true script: | cd sroom-ai/ - git pull + + git fetch + git checkout ${{ github.event.pull_request.head.ref }} + git pull origin ${{ github.event.pull_request.head.ref }} + pip3 install -r requirements.txt tmux send-keys -t celery "^C" C-m From c69311ec0b41744bace603d53e042d7eb79b7861 Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Thu, 9 Nov 2023 19:41:47 +0900 Subject: [PATCH 4/8] [SWM-405] Feat : modify test CICD --- .github/workflows/deploy-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index 7035e39..40e5857 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -28,7 +28,7 @@ jobs: git fetch git checkout ${{ github.event.pull_request.head.ref }} - git pull origin ${{ github.event.pull_request.head.ref }} + git pull pip3 install -r requirements.txt From fc750d455483889aa5f9f2c63a455f9aeeafa0df Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Thu, 9 Nov 2023 19:43:17 +0900 Subject: [PATCH 5/8] [SWM-405] Fix : fix test CICD escape error --- .github/workflows/deploy-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/deploy-test.yml b/.github/workflows/deploy-test.yml index 40e5857..0e99685 100644 --- a/.github/workflows/deploy-test.yml +++ b/.github/workflows/deploy-test.yml @@ -27,7 +27,7 @@ jobs: cd sroom-ai/ git fetch - git checkout ${{ github.event.pull_request.head.ref }} + git checkout '${{ github.event.pull_request.head.ref }}' git pull pip3 install -r requirements.txt From 94741e286491916ffd6399dd4cb2810068c3107b Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Fri, 10 Nov 2023 16:50:49 +0900 Subject: [PATCH 6/8] [SWM-405] Feat : modify timestamp design --- app/summary/summaryv2.py | 11 ++++++----- constants.yaml | 8 +------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/app/summary/summaryv2.py b/app/summary/summaryv2.py index 23d95f9..45bfb05 100644 --- a/app/summary/summaryv2.py +++ b/app/summary/summaryv2.py @@ -19,8 +19,12 @@ async def generate_summary(scripts: dict, video_title: str): for idx, summary in enumerate(summaries): time_delta = datetime.timedelta(seconds=int(time_stamp[idx])) time_format = str(time_delta) - final_summary += '### ' + time_format + '' + '\n' - final_summary += summary + '\n \n \n' + final_summary += ('' + + time_format + '' + '\n') + final_summary += summary + '\n \n ' final_summary = reformat_summary(final_summary) return final_summary, summaries @@ -50,9 +54,6 @@ def divide_chunk(scripts: dict): time_stamps.append(time_stamp) chunks.append(chunk_text) - print(time_stamps) - print(chunks) - return time_stamps, chunks diff --git a/constants.yaml b/constants.yaml index 4992c16..6e8e1fb 100644 --- a/constants.yaml +++ b/constants.yaml @@ -26,13 +26,7 @@ prompt : final_summary: system_message : {"role": "system", "content": "write the given text in the given markdown format, where applicable : ### {section name} \n {content}"} en : "\n\n Please summarize the article in make sure to write the summary in Korean. Please write in Korean but English terms in English. And please make the summary in the form of a markdown raw code with escape sequence." - kr : "\n\n 아래는 유튜브 스크립트야. 해당 스크립트를 한국어로 마크다운 형식으로 요약해줘.\n - 키워드나 핵심 내용들을 나눠서 아래 예시 처럼 작성해줘.\n\n - 예시 : \n - ### 6.25전쟁 발발 이유 \n - - 미국이 애치슨 라인을 발표하였고 한국이 해당 영역에 들지 못했다. \n - - 소련이 북한에 군사적 지원을 했다. - " + kr : "\n\n 아래는 유튜브 스크립트야. 해당 스크립트를 한국어로 마크다운 형식으로 요약해줘. 키워드나 핵심 내용들을 나눠서 작성해줘." summary : system_message: { "role": "system", "content": "summarize the given text in korean so that it contains everything information as much as possible." } From bd51f6baf7444f6d7bdaff7069ddfba7133ffbff Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Fri, 10 Nov 2023 17:00:53 +0900 Subject: [PATCH 7/8] [SWM-405] Feat : modify href format --- app/summary/summaryv2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/summary/summaryv2.py b/app/summary/summaryv2.py index 45bfb05..e4140ca 100644 --- a/app/summary/summaryv2.py +++ b/app/summary/summaryv2.py @@ -19,7 +19,7 @@ async def generate_summary(scripts: dict, video_title: str): for idx, summary in enumerate(summaries): time_delta = datetime.timedelta(seconds=int(time_stamp[idx])) time_format = str(time_delta) - final_summary += ('' + From 63b571f786f874fe669e4d8c9b6066eb9d5a7fa7 Mon Sep 17 00:00:00 2001 From: GyeongSik Son <47748085+Son-GyeongSik@users.noreply.github.com> Date: Fri, 10 Nov 2023 18:43:27 +0900 Subject: [PATCH 8/8] [SWM-405] Feat : modify timestamp tag (a tag -> button) --- app/summary/summaryv2.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/app/summary/summaryv2.py b/app/summary/summaryv2.py index e4140ca..ba40863 100644 --- a/app/summary/summaryv2.py +++ b/app/summary/summaryv2.py @@ -19,11 +19,10 @@ async def generate_summary(scripts: dict, video_title: str): for idx, summary in enumerate(summaries): time_delta = datetime.timedelta(seconds=int(time_stamp[idx])) time_format = str(time_delta) - final_summary += ('' + - time_format + '' + '\n') + final_summary += ('' + '\n') final_summary += summary + '\n \n ' final_summary = reformat_summary(final_summary)