|
import gradio as gr |
|
import openai |
|
import fitz |
|
import os |
|
import tempfile |
|
import time |
|
import logging |
|
import re |
|
from typing import List, Optional, Dict, Any, Union |
|
import concurrent.futures |
|
|
|
|
|
# OpenAI API key shared by the whole process; stays empty until
# set_api_key() stores a value.
api_key = ""

# Root logging setup: INFO level, timestamped records that include the
# logger name and severity.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)
|
|
|
|
|
def set_api_key(key: str) -> str:
    """Store the OpenAI API key used by later API calls.

    The key is stripped of surrounding whitespace and saved in the
    module-level ``api_key`` variable.

    Args:
        key: The API key text entered by the user.

    Returns:
        A status message for display: a prompt to enter a valid key when
        *key* is ``None``, empty or whitespace-only, otherwise a success
        message.
    """
    global api_key

    cleaned = (key or "").strip()
    if not cleaned:
        return "Please enter a valid API key"

    api_key = cleaned
    # The check mark had been mis-decoded into a stray character followed by
    # a line break, which split this literal across two lines.
    return "✅ API Key Set Successfully!"
|
|
|
|
|
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Each page's text is preceded by a ``--- Page N ---`` marker line, with
    pages numbered from 1.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page text. If anything fails, the error is logged
        and a string starting with ``"Error extracting text from PDF: "`` is
        returned instead; no exception is raised.
    """
    try:
        # Use the document as a context manager so its handle is released
        # even when a page fails to extract.
        with fitz.open(pdf_path) as doc:
            parts = []
            for page_num, page in enumerate(doc, start=1):
                parts.append(f"\n--- Page {page_num} ---\n")
                parts.append(page.get_text("text"))
        return "".join(parts)
    except Exception as e:
        logger.error(f"Error extracting text from PDF: {str(e)}")
        return f"Error extracting text from PDF: {str(e)}"
|
|
|
|
|
def truncate_text_for_tokens(text: str, max_tokens: int = 8000) -> str:
    """Cut *text* down to a character budget derived from a token limit.

    The budget is ``max_tokens * 4`` characters. Text within the budget is
    returned unchanged; longer text is cut at the budget and a truncation
    notice is appended.
    """
    budget = 4 * max_tokens
    if len(text) <= budget:
        return text

    notice = "\n[Content truncated due to length...]"
    return text[:budget] + notice
|
|
|
|
|
def extract_title(pdf_text: str) -> str:
    """Guess a document title from the beginning of the extracted text.

    Inspects the first ten lines and returns the first one that, once
    stripped, is longer than 5 and shorter than 200 characters and does not
    start with ``---``. Falls back to ``"Untitled Document"``.
    """
    leading_lines = pdf_text.split('\n')[:10]
    for raw_line in leading_lines:
        candidate = raw_line.strip()
        if candidate.startswith('---'):
            continue
        if 5 < len(candidate) < 200:
            return candidate

    return "Untitled Document"
|
|
|
|
|
# Models offered in the UI: OpenAI model id -> human-readable label.
# get_available_models() may add further ids to this mapping at runtime.
MODEL_OPTIONS: Dict[str, str] = {
    # Label now names the same model as its key (it previously said "GPT-4").
    "gpt-4.1": "GPT-4.1 (Most powerful, slower)",
    "gpt-3.5-turbo": "GPT-3.5 Turbo (Faster, less powerful)",
}
|
|
|
|
|
def get_available_models() -> List[str]:
    """Return the model ids to offer, refreshed from the OpenAI API if possible.

    Without an API key the ids already in ``MODEL_OPTIONS`` are returned.
    With a key, the account's models are listed and every id that contains
    ``gpt-4.1`` or ``gpt-3.5-turbo`` and is not yet known is added to
    ``MODEL_OPTIONS`` (labelled with its own id). Any failure is logged and
    the ids known so far are returned.
    """
    if api_key:
        try:
            openai.api_key = api_key
            listing = openai.Model.list()
            gpt_ids = [entry.id for entry in listing['data'] if 'gpt' in entry.id.lower()]

            for model_id in gpt_ids:
                if model_id in MODEL_OPTIONS:
                    continue
                if 'gpt-4.1' in model_id or 'gpt-3.5-turbo' in model_id:
                    MODEL_OPTIONS[model_id] = model_id
        except Exception as e:
            # Best effort: fall through to whatever is already registered.
            logger.error(f"Error fetching models: {str(e)}")

    return list(MODEL_OPTIONS.keys())
|
|
|
|
|
def process_pdf_in_parallel(pdf_files: List[str]) -> List[tuple]:
    """Extract text from several PDFs concurrently.

    Args:
        pdf_files: Paths of the PDF files to process.

    Returns:
        One ``(file_name, text)`` tuple per input path, in the same order as
        *pdf_files*. ``file_name`` is the base name of the path; ``text`` is
        the extracted text after ``truncate_text_for_tokens``, or an
        ``"Error processing file: ..."`` message if the worker raised.
    """
    if not pdf_files:
        return []

    results = []
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(extract_text_from_pdf, pdf_path) for pdf_path in pdf_files]

        # Collect in submission order rather than completion order so that
        # paper numbering is deterministic and follows the upload order.
        for pdf_path, future in zip(pdf_files, futures):
            pdf_name = os.path.basename(pdf_path)
            try:
                pdf_text = truncate_text_for_tokens(future.result())
            except Exception as e:
                logger.error(f"Error processing {pdf_name}: {str(e)}")
                results.append((pdf_name, f"Error processing file: {str(e)}"))
            else:
                results.append((pdf_name, pdf_text))

    return results
|
|
|
|
|
def create_system_prompt(review_type: str = "systematic") -> str:
    """Return the system prompt that matches the requested review type.

    ``"systematic"`` selects the eleven-step systematic-review instructions
    and ``"literature"`` the literature-review instructions. Every other
    value falls back to the meta-analysis instructions.
    """
    systematic_prompt = """
    You are an expert academic researcher tasked with creating comprehensive systematic reviews. Follow these steps:

    Step 1: Identify a Research Field
    Identify the specific area of study represented in the provided papers.

    Step 2: Generate a Research Question
    Create a specific, measurable, achievable, relevant, and time-bound (SMART) research question that unifies the papers.

    Step 3: Create a Protocol
    Outline a detailed methodology for your review, including analysis methods appropriate for the papers.

    Step 4: Evaluate Relevant Literature
    Critically evaluate the quality, methodology, and findings of the provided papers, identifying gaps or limitations.

    Step 5: Investigate Sources for Answers
    Examine how the papers contribute to answering the research question.

    Step 6: Collect Data as per Protocol
    Implement rigorous data collection methods, extracting key findings and statistics.

    Step 7: Data Extraction
    Organize the extracted data in a structured format, including tables where appropriate.

    Step 8: Critical Analysis of Results
    Interpret patterns, trends, and conclusions from the data, comparing findings across papers.

    Step 9: Interpreting Derivations
    Contextualize the findings in relation to the research question and broader field.

    Step 10: Concluding Statements
    Summarize findings, draw conclusions, and provide recommendations for future research.

    Step 11: References
    Include proper citations for all papers reviewed and any additional references.

    Your review should be:
    - Comprehensive yet concise
    - Well-structured with clear headings and subheadings
    - Using academic language appropriate for a scholarly audience
    - Including data visualizations or tables where helpful
    - Balanced and objective in evaluating the evidence
    """

    literature_prompt = """
    You are an expert academic researcher tasked with creating a thorough literature review. Your review should:

    1. Provide an overview of the current state of knowledge in the specific field
    2. Identify common themes, methodologies, and findings across the papers
    3. Highlight contradictions or inconsistencies in the literature
    4. Evaluate the strength of evidence for key claims
    5. Identify research gaps and future directions
    6. Organize findings in a logical, thematic structure
    7. Include visual elements (tables, concept maps) to synthesize information
    8. Maintain academic rigor and proper attribution

    Your review should be scholarly in tone, well-organized, and provide a balanced assessment of the literature.
    """

    meta_analysis_prompt = """
    You are an expert researcher conducting a meta-analysis of the provided papers. Your analysis should:

    1. Identify a precise research question that can be answered quantitatively
    2. Extract comparable quantitative data, effect sizes, or statistics from the papers
    3. Assess the methodological quality and risk of bias in each study
    4. Synthesize findings using appropriate statistical methods
    5. Present results using forest plots, funnel plots, or other visualizations
    6. Discuss heterogeneity and its potential sources
    7. Evaluate publication bias and its impact on the findings
    8. Draw conclusions based on the pooled data
    9. Discuss implications for practice and future research

    Your meta-analysis should follow PRISMA guidelines where applicable, maintain statistical rigor, and provide clear visual representations of the quantitative synthesis.
    """

    if review_type == "systematic":
        return systematic_prompt
    if review_type == "literature":
        return literature_prompt
    # Anything else, including "meta-analysis", gets the meta-analysis prompt.
    return meta_analysis_prompt
|
|
|
|
|
def _combine_paper_texts(pdf_results: List[tuple], max_chars: int = 20000) -> str:
    """Join paper texts under ``--- PAPER N: name ---`` headers within a character budget."""
    parts = []
    total_chars = 0

    for i, (pdf_name, pdf_text) in enumerate(pdf_results):
        header = f"\n\n--- PAPER {i+1}: {pdf_name} ---\n\n"
        if total_chars + len(header) + len(pdf_text) > max_chars:
            # This paper does not fit. Add a truncated excerpt only when more
            # than 500 characters of room remain, then stop adding papers.
            remaining = max_chars - total_chars - len(header)
            if remaining > 500:
                parts.append(
                    header
                    + pdf_text[:remaining]
                    + "\n[... Content truncated due to length limitations ...]"
                )
            break

        parts.append(header + pdf_text)
        total_chars += len(header) + len(pdf_text)

    return "".join(parts)


def generate_systematic_review(
    pdf_files: List[str],
    review_question: str,
    model: str = "gpt-4.1",
    review_type: str = "systematic",
    include_tables: bool = True,
    temperature: float = 0.7,
    max_tokens: int = 4000
) -> str:
    """Generate a review of the given PDF files with the OpenAI chat API.

    Args:
        pdf_files: Paths of the PDF files to review.
        review_question: Question or topic the review should address.
        model: OpenAI model id passed to the chat completion call.
        review_type: Selects the system prompt via ``create_system_prompt``.
        include_tables: Whether to ask the model for tables and figures.
        temperature: Sampling temperature passed to the API.
        max_tokens: Completion length limit passed to the API.

    Returns:
        The text returned by the model. When the API key, the files or the
        question are missing, or when any step fails, a human-readable
        message is returned instead; no exception is raised.
    """
    if not api_key:
        return "Please enter your OpenAI API key first."

    if not pdf_files:
        return "Please upload at least one PDF file."

    # A whitespace-only question counts as missing.
    if not review_question or not review_question.strip():
        return "Please enter a review question."

    try:
        start_time = time.time()

        openai.api_key = api_key
        system_prompt = create_system_prompt(review_type)

        logger.info(f"Processing {len(pdf_files)} PDFs...")
        pdf_results = process_pdf_in_parallel(pdf_files)

        # Numbered list of the papers with the title guessed for each.
        paper_list = ', '.join(
            f"{number}. {pdf_name} (Title: {extract_title(pdf_text)})"
            for number, (pdf_name, pdf_text) in enumerate(pdf_results, start=1)
        )

        table_instruction = ""
        if include_tables:
            table_instruction = " Please include important tables, charts or figures in your review to help summarize the findings."

        user_prompt = f"""
        Please generate a {review_type} review of the following {len(pdf_files)} papers:
        {paper_list}

        Review Question: {review_question}

        {table_instruction}

        Format your response with clear headings, subheadings, and properly formatted tables using markdown syntax.
        """

        combined_pdf_text = _combine_paper_texts(pdf_results)

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt + combined_pdf_text}
        ]

        logger.info(f"Sending request to OpenAI API (model: {model})...")

        response = openai.ChatCompletion.create(
            model=model,
            messages=messages,
            temperature=temperature,
            # Coerce to int in case the UI hands over a float slider value.
            max_tokens=int(max_tokens)
        )

        result = response["choices"][0]["message"]["content"]

        # The former markdown-to-HTML conversion was removed: the `markdown`
        # module is not imported in this file, so the call raised NameError
        # on every request, and its result was never used.

        time_taken = time.time() - start_time
        logger.info(f"Review generated in {time_taken:.2f} seconds")

        return result

    except Exception as e:
        logger.error(f"Error generating review: {str(e)}")
        return f"Error generating systematic review: {str(e)}"
|
|
|
|
|
def save_uploaded_files(files) -> List[str]:
    """Write uploaded PDFs to temporary files and return their paths.

    Each upload may be raw ``bytes`` (what the file component in this file
    is configured to deliver via ``type="binary"``), a filesystem path, or
    an object exposing a ``name`` attribute and optionally ``read()``.

    Named uploads are accepted only when the name ends in ``.pdf``
    (case-insensitive). Raw byte payloads have no name, so they are accepted
    only when a ``%PDF-`` header appears within the first 1024 bytes.
    ``None`` entries and everything else are skipped.

    Args:
        files: Iterable of uploads, or a falsy value.

    Returns:
        Paths of the temporary ``.pdf`` files written. They are created with
        ``delete=False``, so the caller is responsible for removing them.
    """
    if not files:
        return []

    saved_paths = []
    for file in files:
        if file is None:
            continue

        if isinstance(file, (bytes, bytearray)):
            # Raw bytes carry no file name (``file.name`` would raise), so
            # the upload is validated by content instead of by extension.
            data = bytes(file)
            if b"%PDF-" not in data[:1024]:
                continue
        else:
            is_path = isinstance(file, (str, os.PathLike))
            source = file if is_path else getattr(file, "name", None)
            if not isinstance(source, (str, os.PathLike)):
                # No usable name to check the extension against.
                continue
            if os.path.splitext(os.fspath(source))[1].lower() != '.pdf':
                continue

            if hasattr(file, 'read'):
                data = file.read()
            else:
                with open(source, 'rb') as f:
                    data = f.read()

        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
            tmp_file.write(data)
        saved_paths.append(tmp_file.name)

    return saved_paths
|
|
|
|
|
# Custom stylesheet handed to gr.Blocks(css=...). That parameter takes raw
# CSS, so the rules are not wrapped in <style> tags: the literal tag text
# would be parsed as CSS and invalidate the first rule.
css = """
/* Base styling */
body {
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif;
}

.container {
    max-width: 1200px !important;
    margin: 0 auto;
}

/* Header styling */
.header {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin-bottom: 20px;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

/* Button styling */
#generate_button {
    background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); /* Purple gradient */
    color: white;
    font-weight: bold;
    padding: 10px 20px;
    border-radius: 8px;
    border: none;
    cursor: pointer;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
    transition: all 0.3s ease;
}

#generate_button:hover {
    background: linear-gradient(135deg, #5b10f1 0%, #9f3ef3 100%); /* Slightly lighter */
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}

#api_key_button {
    background: linear-gradient(135deg, #68d391 0%, #48bb78 100%); /* Green gradient */
    color: white;
    font-weight: bold;
    margin-top: 27px;
    padding: 10px 20px;
    border-radius: 8px;
    border: none;
    cursor: pointer;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.2);
    transition: all 0.3s ease;
}

#api_key_button:hover {
    background: linear-gradient(135deg, #38a169 0%, #68d391 100%); /* Slightly darker green */
    transform: translateY(-2px);
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
}

/* Card styling */
.card {
    background-color: white;
    border-radius: 10px;
    padding: 20px;
    margin-bottom: 20px;
    box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
}

/* Form styling */
.form-group {
    margin-bottom: 15px;
}

/* Tabs styling */
.tab-content {
    padding: 20px;
    background-color: white;
    border-radius: 0 0 10px 10px;
}

/* Table styling in output */
.output-container table {
    border-collapse: collapse;
    width: 100%;
    margin: 20px 0;
}

.output-container th, .output-container td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}

.output-container th {
    background-color: #f2f2f2;
    font-weight: bold;
}

.output-container tr:nth-child(even) {
    background-color: #f9f9f9;
}

/* Spinner styling */
.loading-spinner {
    display: inline-block;
    width: 20px;
    height: 20px;
    border: 3px solid rgba(0, 0, 0, 0.1);
    border-radius: 50%;
    border-top-color: #4a00e0;
    animation: spin 1s ease-in-out infinite;
}

@keyframes spin {
    to {
        transform: rotate(360deg);
    }
}

/* Customizations for Gradio */
.gradio-container {
    max-width: 1200px !important;
}

.gr-form, .gr-box {
    border-radius: 10px !important;
}

.gr-input, .gr-textarea {
    border-radius: 6px !important;
}

/* Responsive adjustments */
@media (max-width: 768px) {
    .header {
        padding: 15px;
    }

    #generate_button, #api_key_button {
        padding: 8px 16px;
    }
}
"""


# Banner markup rendered at the top of the page; styled by the .header rule.
header_html = """
<div class="header">
    <h1>Systematic Review Generator for Research Papers</h1>
    <p>Upload multiple PDF papers to generate comprehensive reviews, literature analyses, and meta-analyses</p>
</div>
"""
|
|
|
|
|
def progress_component(text, progress):
    """Render an HTML progress bar with a caption.

    Args:
        text: Caption shown on the left; inserted into the markup as-is,
            without HTML escaping.
        progress: Percentage complete. Values outside 0-100 are clamped so
            the filled bar can neither overflow its track nor get a
            negative width.

    Returns:
        An HTML snippet showing the caption, the percentage and the bar.
    """
    percent = max(0, min(100, progress))
    return f"""
    <div style="margin: 10px 0; width: 100%;">
        <div style="display: flex; align-items: center; margin-bottom: 5px;">
            <div>{text}</div>
            <div style="margin-left: auto;">{percent}%</div>
        </div>
        <div style="background-color: #e0e0e0; height: 8px; border-radius: 4px; width: 100%;">
            <div style="background: linear-gradient(135deg, #4a00e0 0%, #8e2de2 100%); height: 100%; width: {percent}%; border-radius: 4px;"></div>
        </div>
    </div>
    """
|
|
|
|
|
def create_review(files, question, model, review_type, include_tables, temperature, max_tokens, progress=gr.Progress()):
    """Gradio callback: turn the uploaded PDFs into a generated review.

    Saves the uploads to temporary files, calls
    ``generate_systematic_review`` on them and reports progress along the
    way. The temporary files are removed in every case, including when a
    step raises.

    Args:
        files: Uploads from the file component.
        question: Review question or topic.
        model: OpenAI model id.
        review_type: Kind of review to generate.
        include_tables: Whether to ask for tables and figures.
        temperature: Sampling temperature.
        max_tokens: Maximum length of the generated review.
        progress: Gradio progress tracker, called with a fraction and a
            description.

    Returns:
        The review text, or a message describing what went wrong.
    """
    saved_paths = []
    try:
        if not files:
            return "Please upload at least one PDF file."

        progress(0.1, desc="Saving uploaded files...")
        saved_paths = save_uploaded_files(files)

        if not saved_paths:
            return "No valid PDF files were uploaded. Please upload PDF files only."

        progress(0.3, desc="Processing PDFs...")
        review = generate_systematic_review(
            saved_paths,
            question,
            model=model,
            review_type=review_type,
            include_tables=include_tables,
            temperature=temperature,
            max_tokens=max_tokens
        )

        progress(0.9, desc="Finalizing review...")
        progress(1.0, desc="Complete!")
        return review

    except Exception as e:
        logger.error(f"Error in create_review: {str(e)}")
        return f"An error occurred: {str(e)}"

    finally:
        # Clean up here so the temporary copies are deleted even when
        # generation fails; a failed delete is logged, not raised.
        for path in saved_paths:
            try:
                os.remove(path)
            except OSError as e:
                logger.error(f"Error removing temporary file {path}: {str(e)}")
|
|
|
|
|
def create_ui():
    """Build the Gradio Blocks interface and return it without launching.

    The interface has three tabs: "Generate Review" (API key, upload,
    configuration and output), "How to Use" and "About". The "Set API Key"
    button stores the key and refreshes the model dropdown; the "Generate
    Review" button calls ``create_review`` and shows its result.

    Returns:
        The assembled ``gr.Blocks`` object.
    """
    with gr.Blocks(css=css) as demo:
        gr.HTML(header_html)

        with gr.Tabs() as tabs:
            with gr.TabItem("Generate Review"):
                with gr.Row():
                    with gr.Column(scale=1):
                        with gr.Box():
                            gr.Markdown("### 1. Setup API Key")
                            api_key_input = gr.Textbox(
                                label="Enter OpenAI API Key",
                                type="password",
                                placeholder="sk-..."
                            )
                            api_key_button = gr.Button("Set API Key", elem_id="api_key_button")
                            api_key_output = gr.Textbox(
                                label="API Key Status",
                                interactive=False,
                                value="Not set"
                            )

                        with gr.Box():
                            gr.Markdown("### 2. Upload Papers")
                            pdf_files = gr.File(
                                label="Upload PDF Research Papers (PDF files only)",
                                file_count="multiple",
                                type="binary",
                                file_types=[".pdf"]
                            )

                    with gr.Column(scale=1):
                        with gr.Box():
                            gr.Markdown("### 3. Review Configuration")
                            review_question = gr.Textbox(
                                label="Review Question or Topic",
                                placeholder="What are the current advances in GAN applications for speech processing?",
                                lines=2
                            )

                            review_type = gr.Radio(
                                label="Review Type",
                                choices=["systematic", "literature", "meta-analysis"],
                                value="systematic"
                            )

                            model = gr.Dropdown(
                                label="Model",
                                choices=list(MODEL_OPTIONS.keys()),
                                value="gpt-4.1"
                            )

                            with gr.Row():
                                include_tables = gr.Checkbox(
                                    label="Include Tables and Figures",
                                    value=True
                                )

                                with gr.Column():
                                    temperature = gr.Slider(
                                        label="Temperature (Creativity)",
                                        minimum=0.0,
                                        maximum=1.0,
                                        value=0.7,
                                        step=0.1
                                    )

                                    max_tokens = gr.Slider(
                                        label="Maximum Output Length",
                                        minimum=1000,
                                        maximum=8000,
                                        value=4000,
                                        step=500
                                    )

                            generate_button = gr.Button(
                                "Generate Review",
                                elem_id="generate_button",
                                variant="primary"
                            )

                with gr.Box():
                    gr.Markdown("### Review Output")
                    review_output = gr.Markdown(
                        label="Generated Review",
                        value="Review will appear here after generation..."
                    )

                    # NOTE: no click handler is attached to these two
                    # buttons anywhere in this function.
                    with gr.Row():
                        copy_button = gr.Button("📋 Copy to Clipboard")
                        export_button = gr.Button("📥 Export as Markdown")

            with gr.TabItem("How to Use"):
                gr.Markdown("""
                ### Getting Started with the Systematic Review Generator

                #### 1. Setting Up
                - Enter your OpenAI API key in the field provided and click "Set API Key"
                - You'll need an API key with access to GPT-4 or GPT-3.5 for best results
                - Your API key is never stored and is only used for this session

                #### 2. Uploading Papers
                - Upload 2 or more PDF research papers (the more related they are, the better)
                - Only PDF files are supported
                - Papers should ideally be related to the same research field

                #### 3. Configuring Your Review
                - Enter a specific review question or topic
                - Choose the review type:
                  - **Systematic Review**: Follows a rigorous methodology to answer a specific research question
                  - **Literature Review**: Provides an overview of existing research on a topic
                  - **Meta-Analysis**: Combines and analyzes quantitative data from multiple studies
                - Select the AI model (GPT-4 recommended for complex papers)
                - Adjust temperature (higher = more creative, lower = more focused)
                - Set maximum output length (longer reviews will be more comprehensive)

                #### 4. Generating Your Review
                - Click "Generate Review" to start the process
                - Processing time depends on the number and size of papers, and the selected model
                - You can copy or export the final review when complete

                #### Tips for Best Results
                - Use papers from the same field or on related topics
                - Be specific in your review question
                - For technical papers, choose GPT-4 for better comprehension
                - The system works best with 2-5 related papers
                - Consider using a lower temperature (0.3-0.5) for more factual reviews
                """)

            with gr.TabItem("About"):
                gr.Markdown("""
                ### About the Systematic Review Generator

                This application helps researchers, students, and academics generate comprehensive reviews of scientific papers. It leverages advanced AI to analyze PDF research papers and synthesize findings into structured, coherent reviews.

                #### Features
                - Support for multiple review types: systematic reviews, literature reviews, and meta-analyses
                - Automatic extraction of text from PDF files
                - Parallel processing of multiple papers
                - Integration with OpenAI's GPT models
                - Customizable output parameters
                - Table and figure generation capabilities

                #### How It Works
                1. The system extracts text from your uploaded PDFs
                2. It identifies the main topics, methodologies, and findings
                3. Based on your review question, it synthesizes information across papers
                4. It structures the information following academic review standards
                5. It provides a comprehensive review with proper sections and references

                #### Limitations
                - The quality of the review depends on the clarity of the PDFs and their text extraction
                - Complex scientific notation, tables, or images in PDFs may not be perfectly interpreted
                - The system provides a starting point, not a final paper - always review and verify the output
                - Token limits may prevent full analysis of very long or numerous papers

                #### Privacy & Security
                - Your API key is never stored and is only used for the current session
                - Uploaded PDFs are processed temporarily and deleted after review generation
                - No data is retained after you close the application
                """)

        def set_key_and_refresh_models(key):
            """Store the key, then rebuild the model choices using it."""
            status = set_api_key(key)
            return status, gr.Dropdown.update(choices=get_available_models())

        # One handler instead of two independent click listeners: the model
        # list must be fetched only after the new key has been stored, and
        # separate listeners give no such ordering.
        api_key_button.click(
            set_key_and_refresh_models,
            inputs=[api_key_input],
            outputs=[api_key_output, model]
        )

        generate_button.click(
            create_review,
            inputs=[pdf_files, review_question, model, review_type, include_tables, temperature, max_tokens],
            outputs=[review_output]
        )

    return demo
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build the interface, then serve it.
    # share=True is passed through to Gradio's launch().
    demo = create_ui()
    demo.launch(share=True)