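"""Generate task-decomposition principle prompts from the paper info dataset.

The script groups paper records by problem type (A-F) and number of subtasks,
samples a few example decompositions per group, and asks an LLM (via
DECOMPOSE_PRINCIPLE_PROMPT) to produce a decomposition principle for each
(problem_type, tasknum) pair. The answers are written to
../data/actor_data/input/decompose_prompt.json.
"""
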
import random
from collections import Counter, defaultdict

from llm.llm import LLM
from prompt.template import DECOMPOSE_PRINCIPLE_PROMPT
from utils.utils import read_json_file, write_json_file
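
# Rough sketch of the record layout this script assumes for
# paper_info_dataset.json, inferred from the field accesses below
# (the real dataset may contain additional fields):
#
# {
#   "data": [
#     {
#       "paper": "2015_A/...",            # "<year>_<problem type>/<paper id>"
#       "info": {
#         "tasks": [
#           {"task_description": "..."},  # one entry per subtask
#           ...
#         ]
#       }
#     },
#     ...
#   ]
# }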


def read_problem_papers(problem_name):
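    """Return the 'info' entry of every paper whose identifier starts with problem_name."""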
    paper_dict = read_json_file('../data/paper_info_dataset.json')['data']
    papers = []
    for paper in paper_dict:
        if paper['paper'].startswith(problem_name):
            papers.append(paper['info'])
    return papers


def generate_decompose_prompt(data):
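    """Build decomposition-principle prompts and collect the LLM answers.

    Returns a dict mapping problem_type (e.g. 'A') to {tasknum: LLM answer}.
    """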
    # llm = LLM('deepseek-reasoner')
    llm = LLM('chatgpt-4o-latest')
    # Step 1: keep papers from 2014 onwards (the identifier starts with the year)
    filtered_papers = [paper for paper in data if paper['paper'].split('/')[0] >= '2014']
    # Step 2: group paper info by problem identifier (e.g. '2015_A')
    problem_papers = defaultdict(list)
    for paper in filtered_papers:
        problem = paper['paper'].split('/')[0]
        problem_papers[problem].append(paper['info'])
    # Keep at most 3 papers for each problem
    for problem, papers in problem_papers.items():
        if len(papers) > 3:
            problem_papers[problem] = random.sample(papers, 3)
        else:
            problem_papers[problem] = papers
    # Step 3: group papers by problem type (the second part of the identifier, e.g. 'A')
    problem_type_papers = defaultdict(list)
    for problem, papers in problem_papers.items():
        problem_type = problem.split('_')[1]
        problem_type_papers[problem_type] += papers
    # Step 4: group by (problem_type, number of tasks)
    tasknum_papers = defaultdict(list)
    for problem_type, papers in problem_type_papers.items():
        for paper in papers:
            tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    filtered_tasknum_papers = tasknum_papers
    # # Step 5 (disabled): keep only the most frequent tasknums for each problem_type
    # filtered_tasknum_papers = defaultdict(list)
    # for problem_type, papers in problem_type_papers.items():
    #     # Count the frequencies of tasknum within this problem_type
    #     tasknum_counts = Counter(len(paper['tasks']) for paper in papers)
    #     # Get the three most frequent tasknums
    #     most_common_tasknums = [tasknum for tasknum, _ in tasknum_counts.most_common(3)]
    #     print(problem_type, most_common_tasknums)
    #     # Keep only the papers with those tasknums
    #     for paper in papers:
    #         if len(paper['tasks']) in most_common_tasknums:
    #             filtered_tasknum_papers[(problem_type, len(paper['tasks']))].append(paper)
    # For each (problem_type, tasknum) bucket, sample a few example decompositions
    # and ask the LLM for a decomposition principle
    result = defaultdict(dict)
    for (problem_type, tasknum), papers in filtered_tasknum_papers.items():
        if tasknum not in [3, 4, 5] or problem_type not in ['A', 'B', 'C', 'D', 'E', 'F']:
            continue
        # if tasknum not in [4] or problem_type not in ['C']:
        #     continue
        print(f"Problem Type: {problem_type}, Task Number: {tasknum}, size: {len(papers)}")
        selected_papers = random.sample(papers, min(len(papers), 6))
        examples = '---'.join([task_decompose(paper) for paper in selected_papers])
        prompt = DECOMPOSE_PRINCIPLE_PROMPT.format(examples=examples, tasknum=tasknum)
        answer = llm.generate(prompt)
        result[problem_type][int(tasknum)] = answer
    return result


def task_decompose(paper):
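    """Format a paper's subtasks as a '- Subtask i: ...' list, one per line.

    Illustrative output for a paper with two subtasks (placeholder text; the
    real descriptions come from the dataset):

        - Subtask 1: <task_description of the first subtask>
        - Subtask 2: <task_description of the second subtask>
    """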
    return '\n'.join([f"- Subtask {i}: {task['task_description']}" for i, task in enumerate(paper['tasks'], start=1)])


if __name__ == "__main__":
    data = read_json_file('../data/actor_data/input/paper_info_dataset.json')
    result = generate_decompose_prompt(data['data'])
    write_json_file('../data/actor_data/input/decompose_prompt.json', result)
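    # The resulting decompose_prompt.json has roughly this shape (one entry per
    # (problem_type, tasknum) pair that had papers; answers abridged):
    # {
    #   "A": {"3": "<LLM answer>", "4": "<LLM answer>", "5": "<LLM answer>"},
    #   ...
    # }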