PyQuarX committed
Commit 3e80e9e · verified · 1 Parent(s): a9ab18d

Update parse.py

Files changed (1)
  1. parse.py +41 -83
parse.py CHANGED
@@ -1,93 +1,59 @@
- import pandas as pd
  from langchain_core.prompts import ChatPromptTemplate
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
- from huggingface_hub import login
- import torch
+ from langchain_openai import ChatOpenAI
+ from langchain_core.messages import HumanMessage
  import os
+ import pandas as pd

- # Hugging Face API Token from Space Secrets
- HF_TOKEN = os.getenv("HF_TOKEN")
- if not HF_TOKEN:
-     raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Space Settings under Secrets.")
-
- # Model configuration
- MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-
- # Initialize model and tokenizer
- try:
-     # Log in to Hugging Face Hub
-     login(token=HF_TOKEN)
-     print("Successfully logged in to Hugging Face Hub")

-     # Load tokenizer and model
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

-     # Create text generation pipeline
-     llm_pipeline = pipeline(
-         "text-generation",
-         model=model,
-         tokenizer=tokenizer,
-         device=0 if torch.cuda.is_available() else -1,  # Use GPU if available in Space
-         max_new_tokens=500,  # Limit response length
-         pad_token_id=tokenizer.eos_token_id,  # Ensure proper padding
-     )
- except Exception as e:
-     print(f"Failed to load model: {str(e)}")
-     llm_pipeline = None
+ # Load OpenRouter API Key
+ openrouter_api_key = os.getenv("API_MV")
+
+ model = ChatOpenAI(
+     openai_api_key=openrouter_api_key,  # Use OpenRouter API key
+     model="meta-llama/llama-4-maverick:free",  # Llama 4 Maverick (free tier) served via OpenRouter
+     base_url="https://openrouter.ai/api/v1"  # OpenRouter API URL
+ )
+
+ # Create a chat prompt template
+ template = (
+     "You are tasked with extracting specific information from the following text content: {dom_content}. "
+     "Please follow these instructions carefully:\n\n"
+     "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
+     "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
+     "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
+     " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
+     " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
+     " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
+     " - Use pipes (|) to separate columns in each data row.\n"
+     "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
+     "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
+     "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
+     "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
+ )

  # Function to parse and extract information from the chunks
  def parse(dom_chunks, parse_description):
-     """Parse and extract information from DOM chunks using a local LLM."""
-     if llm_pipeline is None:
-         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
-
-     # Create a prompt template
-     template = (
-         "You are tasked with extracting specific information from the following text content: {dom_content}. "
-         "Please follow these instructions carefully:\n\n"
-         "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
-         "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
-         "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
-         " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
-         " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
-         " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
-         " - Use pipes (|) to separate columns in each data row.\n"
-         "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
-         "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
-         "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
-         "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
-     )
+     prompt = ChatPromptTemplate.from_template(template)
+     chain = prompt | model

      parsed_results = []

      # Loop through the chunks and parse
      for i, chunk in enumerate(dom_chunks, start=1):
-         # Format the prompt
-         prompt = template.format(dom_content=chunk, parse_description=parse_description)
-
-         # Invoke the LLM pipeline
-         response = llm_pipeline(prompt, max_length=2000, truncation=True)
-         result = response[0]["generated_text"]
-
-         # Clean the output to keep only the Markdown table (remove prompt text)
-         start_idx = result.find("|")
-         if start_idx != -1:
-             result = result[start_idx:]
-         else:
-             result = ""  # Return empty string if no table is found
+         response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})

+         # Extract the content from the AIMessage and add it to the results
          print(f"Parsed batch {i} of {len(dom_chunks)}")
-         parsed_results.append(result)
+         parsed_results.append(response.content)  # Ensure content is extracted properly

      # Return the parsed results as a single string
      return "\n".join(parsed_results)

  def merge_tables_with_llm(tables, parse_description):
-     """Merges a list of Pandas DataFrames into a single Markdown table using a local LLM."""
-     if llm_pipeline is None:
-         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
-
+     """Merges a list of Pandas DataFrames into a single Markdown table using an LLM."""
+     from langchain_core.prompts import ChatPromptTemplate
+     from langchain_openai import ChatOpenAI
      # Convert DataFrames to Markdown strings
      table_strings = [table.to_markdown(index=False) for table in tables]
 
@@ -109,17 +75,9 @@ def merge_tables_with_llm(tables, parse_description):
          "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
          "Here are the tables:\n\n" + "\n\n".join(table_strings) +
          "\n\nReturn the merged table in Markdown format:"
-     ).format(parse_description=parse_description)
-
-     # Invoke the LLM pipeline
-     response = llm_pipeline(merge_prompt, max_length=2000, truncation=True)
-     merged_table = response[0]["generated_text"]
-
-     # Clean the output to keep only the Markdown table
-     start_idx = merged_table.find("|")
-     if start_idx != -1:
-         merged_table = merged_table[start_idx:]
-     else:
-         merged_table = ""  # Return empty string if no table is found
+     )

-     return merged_table
+     # Invoke the LLM
+     message = HumanMessage(content=merge_prompt)
+     response = model.invoke([message])
+     return response.content
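For context, here is a minimal usage sketch of the rewritten `parse` function. This is a sketch only: the DOM chunks and the description string are hypothetical placeholders, and it assumes the updated module is importable as `parse.py`, that `langchain-openai` is installed, and that a valid OpenRouter key is present in the `API_MV` environment variable (the module constructs its `ChatOpenAI` client at import time).

```python
import os

# parse.py reads API_MV at import time to build its ChatOpenAI client,
# so check for the OpenRouter key before importing (hypothetical guard).
if not os.getenv("API_MV"):
    raise RuntimeError("Set API_MV to an OpenRouter API key first")

from parse import parse  # the module updated in this commit

# Hypothetical scraped DOM chunks to extract tabular data from.
dom_chunks = [
    "<table><tr><th>Product</th><th>Price</th></tr>"
    "<tr><td>Widget</td><td>9.99</td></tr></table>",
]

# Each chunk goes through the prompt | model chain; the extracted
# Markdown tables come back joined with newlines.
markdown_tables = parse(dom_chunks, "product names and prices")
print(markdown_tables)
```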
 
 
 
 
 
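A similar sketch for `merge_tables_with_llm`, which now wraps the merge prompt in a `HumanMessage` and returns `response.content` directly; the DataFrames below are made up. Two behavior changes worth noting: with the `.format(parse_description=...)` call dropped, any `{parse_description}` placeholder remaining in the prompt body is likely sent to the model unfilled, and the reply is no longer trimmed to start at the first pipe character, so any preamble the model emits passes through.

```python
import pandas as pd

from parse import merge_tables_with_llm  # assumes API_MV is already set

# Two hypothetical tables with matching columns for the LLM to merge.
tables = [
    pd.DataFrame({"Product": ["Widget"], "Price": [9.99]}),
    pd.DataFrame({"Product": ["Gadget"], "Price": [19.99]}),
]

# Each DataFrame is rendered with to_markdown() (which needs tabulate),
# embedded into the merge prompt, and the model's reply is returned as-is.
merged = merge_tables_with_llm(tables, "product names and prices")
print(merged)
```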