PyQuarX committed
Commit 7d633ab · verified · 1 Parent(s): fb0ee3f

Update parse.py

Files changed (1)
  1. parse.py +102 -82
parse.py CHANGED
@@ -1,105 +1,125 @@
- import streamlit as st
-
- from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
  import pandas as pd
-
- # Initialize the LLM with transformers
- model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"  # Lightweight model for this example
- tokenizer = AutoTokenizer.from_pretrained(model_name)
- model = AutoModelForCausalLM.from_pretrained(model_name)
-
- # Create a text-generation pipeline
- llm_pipeline = pipeline(
-     "text-generation",
-     model=model,
-     tokenizer=tokenizer,
-     device=0 if torch.cuda.is_available() else -1,  # Use GPU if available
-     max_new_tokens=500,  # Limit the response length
- )
-
- st.title("AI Web Scraper")
-
- # Multi-URL Input
- urls = st.text_area("Enter Website URLs (one per line)", height=150)
- urls_list = [url.strip() for url in urls.splitlines() if url.strip()]
-
- if st.button("Scrape Sites"):
-     all_results = []
-     for url in urls_list:
-         st.write(f"Scraping: {url}")
-         result = scrape_website(url)
-         body_content = extract_body_content(result)
-         cleaned_content = clean_body_content(body_content)
-         all_results.append(cleaned_content)
-
-     st.session_state.all_dom_content = all_results
-
- if "all_dom_content" in st.session_state:
-     parse_description = st.text_area("Describe what you want to parse from ALL sites:")
-
-     if st.button("Parse Content"):
-         if parse_description:
-             all_tables = []
-             for i, dom_content in enumerate(st.session_state.all_dom_content):
-                 st.write(f"Parsing content from site {i+1}")
-                 dom_chunks = split_dom_content(dom_content)
-                 result = parse(dom_chunks, parse_description)  # Make sure this function uses the local model
-                 st.write("Raw LLM Output:")
-                 st.write(result)
-
-                 tables = markdown_to_csv(result)
-                 if tables:
-                     st.write("Extracted Tables:")
-                     for table in tables:
-                         st.write(table)
-                         all_tables.append(table)
-                 else:
-                     st.write("No tables found in the output. Displaying raw output instead.")
-                     st.text_area("Raw Output", result, height=200)
-
-             # Merge tables using LLM
-             if all_tables:
-                 st.write("Merging all tables using LLM...")
-                 merged_table_string = merge_tables_with_llm(all_tables, parse_description)
-                 st.write("Merged Table (LLM Output):")
-                 st.write(merged_table_string)
-
-                 # Convert merged table string to DataFrame
-                 merged_tables = markdown_to_csv(merged_table_string)
-                 if merged_tables:
-                     st.write("Merged Table (DataFrame):")
-                     st.write(merged_tables[0])
-                 else:
-                     st.write("Could not convert merged table string to DataFrame.")
-             else:
-                 st.write("No tables to merge.")
+ from langchain_core.prompts import ChatPromptTemplate
+ from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+ from huggingface_hub import login
+ import torch
+ import os
+
+ # Hugging Face API Token from Space Secrets
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ if not HF_TOKEN:
+     raise ValueError("HF_TOKEN environment variable not set. Please set it in Hugging Face Space Settings under Secrets.")
+
+ # Model configuration
+ MODEL_NAME = "facebook/opt-125m"  # Lightweight model; replace with e.g., mistralai/Mixtral-8x7B-Instruct-v0.1 for paid Spaces with GPU
+
+ # Initialize model and tokenizer
+ try:
+     # Log in to Hugging Face Hub
+     login(token=HF_TOKEN)
+     print("Successfully logged in to Hugging Face Hub")
+
+     # Load tokenizer and model
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
+
+     # Create text generation pipeline
+     llm_pipeline = pipeline(
+         "text-generation",
+         model=model,
+         tokenizer=tokenizer,
+         device=0 if torch.cuda.is_available() else -1,  # Use GPU if available in Space
+         max_new_tokens=500,  # Limit response length
+         pad_token_id=tokenizer.eos_token_id,  # Ensure proper padding
+     )
+ except Exception as e:
+     print(f"Failed to load model: {str(e)}")
+     llm_pipeline = None
+
+ # Function to parse and extract information from the chunks
+ def parse(dom_chunks, parse_description):
+     """Parse and extract information from DOM chunks using a local LLM."""
+     if llm_pipeline is None:
+         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
+
+     # Create a prompt template
+     template = (
+         "You are tasked with extracting specific information from the following text content: {dom_content}. "
+         "Please follow these instructions carefully:\n\n"
+         "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
+         "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
+         "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
+         "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
+         "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
+         "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
+         "   - Use pipes (|) to separate columns in each data row.\n"
+         "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
+         "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
+         "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
+         "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
+     )
+
+     parsed_results = []
+
+     # Loop through the chunks and parse
+     for i, chunk in enumerate(dom_chunks, start=1):
+         # Format the prompt
+         prompt = template.format(dom_content=chunk, parse_description=parse_description)
+
+         # Invoke the LLM pipeline
+         response = llm_pipeline(prompt, max_length=2000, truncation=True)
+         result = response[0]["generated_text"]
+
+         # Clean the output to keep only the Markdown table (remove prompt text)
+         start_idx = result.find("|")
+         if start_idx != -1:
+             result = result[start_idx:]
+         else:
+             result = ""  # Return empty string if no table is found
+
+         print(f"Parsed batch {i} of {len(dom_chunks)}")
+         parsed_results.append(result)
+
+     # Return the parsed results as a single string
+     return "\n".join(parsed_results)
  
  def merge_tables_with_llm(tables, parse_description):
      """Merges a list of Pandas DataFrames into a single Markdown table using a local LLM."""
+     if llm_pipeline is None:
+         raise ValueError("LLM pipeline not initialized. Check model loading and ensure HF_TOKEN is set in Space Secrets.")
+
      # Convert DataFrames to Markdown strings
      table_strings = [table.to_markdown(index=False) for table in tables]
  
-     # Create a prompt for the LLM
+     # Create a prompt for the LLM
      merge_prompt = (
          "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
          "The tables contain information related to: {parse_description}.\n"
-         "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
-         "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
-         "Ensure the final output is a single valid Markdown table.\n\n"
+         "Please follow these instructions carefully:\n\n"
+         "1. **Task:** Merge the data from the following tables into a single table that matches the description: {parse_description}.\n"
+         "2. **Output Format:** Return the merged data ONLY as a single Markdown table. The table MUST be correctly formatted.\n"
+         "3. **Markdown Table Format:** The table must adhere to the following Markdown format:\n"
+         "   - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
+         "   - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
+         "   - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
+         "   - Use pipes (|) to separate columns in each data row.\n"
+         "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table.\n"
+         "5. **Empty Response:** Return an empty string ('') if no data can be merged.\n"
+         "6. **Duplicate Columns:** If there are duplicate columns, rename them to be unique.\n"
+         "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
          "Here are the tables:\n\n" + "\n\n".join(table_strings) +
          "\n\nReturn the merged table in Markdown format:"
      ).format(parse_description=parse_description)
  
-     # Call the local model via the pipeline
+     # Invoke the LLM pipeline
      response = llm_pipeline(merge_prompt, max_length=2000, truncation=True)
      merged_table = response[0]["generated_text"]
  
-     # Clean the output to keep only the Markdown table
-     # Assume the table starts right after the prompt
+     # Clean the output to keep only the Markdown table
      start_idx = merged_table.find("|")
      if start_idx != -1:
          merged_table = merged_table[start_idx:]
      else:
-         merged_table = "No valid Markdown table found in LLM output."
+         merged_table = ""  # Return empty string if no table is found
  
      return merged_table
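
With the Streamlit UI gone from parse.py, parse() and merge_tables_with_llm() are plain importable helpers. Below is a minimal sketch of how the UI that previously lived in this file could drive them from a separate script; app.py and the scrape module are hypothetical names, and scrape_website, extract_body_content, clean_body_content, split_dom_content, and markdown_to_csv are the helpers the old code called, assumed to be importable.

# Hypothetical app.py: re-creates the removed UI on top of the refactored helpers.
import streamlit as st

from parse import parse, merge_tables_with_llm
from scrape import (  # assumed module; names match the calls in the removed code
    scrape_website, extract_body_content, clean_body_content,
    split_dom_content, markdown_to_csv,
)

st.title("AI Web Scraper")
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    # Scrape each site and keep the cleaned DOM text in session state
    st.session_state.all_dom_content = [
        clean_body_content(extract_body_content(scrape_website(url)))
        for url in urls_list
    ]

if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")
    if st.button("Parse Content") and parse_description:
        all_tables = []
        for i, dom_content in enumerate(st.session_state.all_dom_content, start=1):
            st.write(f"Parsing content from site {i}")
            result = parse(split_dom_content(dom_content), parse_description)
            all_tables.extend(markdown_to_csv(result))  # Markdown -> DataFrames
        if all_tables:
            st.write(merge_tables_with_llm(all_tables, parse_description))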
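The markdown_to_csv helper referenced above is not part of this diff; merge_tables_with_llm only requires that it return pandas DataFrames (it calls .to_markdown() on each). A rough, hypothetical sketch under that assumption, handling simple pipe tables with one header row and one alignment row:

# Hypothetical markdown_to_csv: parses pipe-delimited Markdown tables into DataFrames.
from io import StringIO
import pandas as pd

def markdown_to_csv(markdown_text):
    """Extract each Markdown pipe table in the text as a pandas DataFrame (sketch)."""
    tables, block = [], []
    for line in markdown_text.splitlines() + [""]:  # trailing "" flushes the last block
        if line.strip().startswith("|"):
            block.append(line.strip().strip("|"))
        elif block:
            # Drop the |---|---| alignment row, then read the rest as '|'-separated CSV
            rows = [r for r in block if not set(r.replace("|", "").strip()) <= set("-: ")]
            df = pd.read_csv(StringIO("\n".join(rows)), sep="|", skipinitialspace=True)
            df.columns = [c.strip() for c in df.columns]
            tables.append(df)
            block = []
    return tables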