PyQuarX committed on
Commit
fb65c7a
·
verified ·
1 Parent(s): f64f0d6

Update parse.py

Browse files
Files changed (1) hide show
  1. parse.py +85 -61
parse.py CHANGED
@@ -1,83 +1,107 @@
1
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage
import os

import pandas as pd

# Load the OpenRouter API key from the environment -- never hardcode secrets
# in source control. The literal key previously committed here must be
# considered leaked and should be revoked.
openrouter_api_key = os.getenv("OPENROUTER_API_KEY", "")

model = ChatOpenAI(
    openai_api_key=openrouter_api_key,  # Use OpenRouter API key
    model="meta-llama/llama-4-maverick:free",  # Model served via OpenRouter
    base_url="https://openrouter.ai/api/v1",  # OpenRouter API URL
)
17
 
18
# Create a chat prompt template.
# {dom_content} and {parse_description} are placeholders filled in later by
# ChatPromptTemplate.from_template(...) inside parse().
template = (
    "You are tasked with extracting specific information from the following text content: {dom_content}. "
    "Please follow these instructions carefully:\n\n"
    "1. **Task:** Extract data from the provided text that matches the description: {parse_description}.\n"
    "2. **Output Format:** Return the extracted data ONLY as one or more Markdown tables. Each table MUST be correctly formatted.\n"
    "3. **Markdown Table Format:** Each table must adhere to the following Markdown format:\n"
    " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
    " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
    " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
    " - Use pipes (|) to separate columns in each data row.\n"
    "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table(s).\n"
    "5. **Empty Response:** If no information matches the description, return an empty string ('').\n"
    "6. **Multiple Tables:** If the text contains multiple tables matching the description, return each table separately, following the Markdown format for each.\n"
    "7. **Accuracy:** The extracted data must be accurate and reflect the information in the provided text.\n"
)
34
 
35
# Function to parse and extract information from the chunks
def parse(dom_chunks, parse_description):
    """Run the extraction prompt over every DOM chunk and join the answers.

    Args:
        dom_chunks: Sequence of text chunks from a scraped page (must support
            len(), used for progress reporting below).
        parse_description: User description of the data to extract; substituted
            into the module-level ``template``.

    Returns:
        All per-chunk model outputs joined with newlines (each expected to be
        a Markdown table, per ``template``).
    """
    prompt = ChatPromptTemplate.from_template(template)
    chain = prompt | model

    parsed_results = []

    # Loop through the chunks and parse
    for i, chunk in enumerate(dom_chunks, start=1):
        response = chain.invoke({"dom_content": chunk, "parse_description": parse_description})

        # Extract the content from AIMessage and add it to the results
        print(f"Parsed batch {i} of {len(dom_chunks)}")
        parsed_results.append(response.content)  # Ensure content is extracted properly

    # Return the parsed results as a single string
    return "\n".join(parsed_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
def merge_tables_with_llm(tables, parse_description):
    """Merges a list of Pandas DataFrames into a single Markdown table using LLM.

    Args:
        tables: Pandas DataFrames whose contents should be merged.
        parse_description: Description of the data, substituted into the prompt.

    Returns:
        The model's response content (expected to be a single Markdown table).
    """
    from langchain_core.prompts import ChatPromptTemplate
    from langchain_openai import ChatOpenAI
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Fix: substitute the description up front. The original never called
    # .format(), so the literal "{parse_description}" placeholder was sent to
    # the model. Formatting only the instruction text (not the appended table
    # data) also keeps braces inside table cells from breaking str.format().
    instructions = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        "The tables contain information related to: {parse_description}.\n"
        "Please follow these instructions carefully:\n\n"
        "1. **Task:** Merge the data from the following tables into a single table that matches the description: {parse_description}.\n"
        "2. **Output Format:** Return the merged data ONLY as a single Markdown table. The table MUST be correctly formatted.\n"
        "3. **Markdown Table Format:** The table must adhere to the following Markdown format:\n"
        " - Start with a header row, clearly labeling each column, separated by pipes (|).\n"
        " - Follow the header row with an alignment row, using hyphens (-) to indicate column alignment (e.g., --- for left alignment).\n"
        " - Subsequent rows should contain the data, with cells aligned according to the alignment row.\n"
        " - Use pipes (|) to separate columns in each data row.\n"
        "4. **No Explanations:** Do not include any introductory or explanatory text before or after the table.\n"
        "5. **Empty Response:** If no information matches the description, return an empty string ('') if no data can be merged.\n"
        "6. **Duplicate Columns:** If there are duplicate columns, rename them to be unique.\n"
        "7. **Missing Values:** If there are missing values, fill them with 'N/A'.\n\n"
    ).format(parse_description=parse_description)
    merge_prompt = (
        instructions
        + "Here are the tables:\n\n" + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # Invoke the LLM
    message = HumanMessage(content=merge_prompt)
    response = model.invoke([message])
    return response.content
 
1
import streamlit as st
from scraper import scrape_website, split_dom_content, clean_body_content, extract_body_content
# NOTE(review): this commit claims to update parse.py, yet the content imports
# from `parse` -- if this really is parse.py that is a circular self-import;
# it reads like an app.py / main.py script. Confirm the intended target file.
from parse import parse, merge_tables_with_llm
from Data import markdown_to_csv
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import pandas as pd
import torch  # Fix: torch is used for the CUDA check below but was never imported

# Initialize the LLM with transformers.
# NOTE(review): the original comment called this a lightweight model, but a
# 17B-parameter checkpoint is not -- confirm the target hardware can load it.
model_name = "meta-llama/Llama-4-Maverick-17B-128E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Create a text-generation pipeline
llm_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,  # Use the GPU when available
    max_new_tokens=500,  # Limit the response length
)
21
 
22
st.title("AI Web Scraper")

# Multi-URL Input: one URL per line; blank lines are ignored.
urls = st.text_area("Enter Website URLs (one per line)", height=150)
urls_list = [url.strip() for url in urls.splitlines() if url.strip()]

if st.button("Scrape Sites"):
    all_results = []
    for url in urls_list:
        st.write(f"Scraping: {url}")
        result = scrape_website(url)
        body_content = extract_body_content(result)
        cleaned_content = clean_body_content(body_content)
        all_results.append(cleaned_content)

    # Persist the cleaned page text across Streamlit reruns so the parsing
    # stage can use it after the next button click.
    st.session_state.all_dom_content = all_results
 
 
 
38
 
39
# Parsing stage: only rendered once some scraped content is in session state.
if "all_dom_content" in st.session_state:
    parse_description = st.text_area("Describe what you want to parse from ALL sites:")

    if st.button("Parse Content"):
        if parse_description:
            all_tables = []
            # Parse each scraped site independently, chunking the DOM text so
            # each request fits the model's context window.
            for i, dom_content in enumerate(st.session_state.all_dom_content):
                st.write(f"Parsing content from site {i+1}")
                dom_chunks = split_dom_content(dom_content)
                result = parse(dom_chunks, parse_description)  # Make sure this function uses the local model
                st.write("Raw LLM Output:")
                st.write(result)

                # markdown_to_csv presumably extracts DataFrames from the
                # Markdown output -- verify against Data.py.
                tables = markdown_to_csv(result)
                if tables:
                    st.write("Extracted Tables:")
                    for table in tables:
                        st.write(table)
                        all_tables.append(table)
                else:
                    st.write("No tables found in the output. Displaying raw output instead.")
                    st.text_area("Raw Output", result, height=200)

            # Merge tables using LLM
            if all_tables:
                st.write("Merging all tables using LLM...")
                merged_table_string = merge_tables_with_llm(all_tables, parse_description)
                st.write("Merged Table (LLM Output):")
                st.write(merged_table_string)

                # Convert merged table string to DataFrame
                merged_tables = markdown_to_csv(merged_table_string)
                if merged_tables:
                    st.write("Merged Table (DataFrame):")
                    st.write(merged_tables[0])
                else:
                    st.write("Could not convert merged table string to DataFrame.")
            else:
                st.write("No tables to merge.")
78
 
79
def merge_tables_with_llm(tables, parse_description):
    """Merges a list of Pandas DataFrames into a single Markdown table using a local LLM.

    Args:
        tables: Pandas DataFrames to merge (each must support ``to_markdown``).
        parse_description: Free-text description of the data, interpolated
            into the prompt.

    Returns:
        The merged Markdown table as a string, or a fallback message when the
        model output contains no table.
    """
    # Convert DataFrames to Markdown strings
    table_strings = [table.to_markdown(index=False) for table in tables]

    # Create a prompt for the LLM. Fix: format only the instruction text --
    # the original formatted the whole prompt *including* the table data, so
    # any literal brace inside a table cell raised KeyError/ValueError.
    instructions = (
        "You are tasked with merging the following Markdown tables into a single, comprehensive Markdown table.\n"
        "The tables contain information related to: {parse_description}.\n"
        "Combine the tables, ensuring that the merged table is well-formatted and contains all relevant information.\n"
        "If there are duplicate columns, rename them to be unique. If there are missing values, fill them with 'N/A'.\n"
        "Ensure the final output is a single valid Markdown table.\n\n"
    ).format(parse_description=parse_description)
    merge_prompt = (
        instructions
        + "Here are the tables:\n\n" + "\n\n".join(table_strings)
        + "\n\nReturn the merged table in Markdown format:"
    )

    # Call the local model via the pipeline
    response = llm_pipeline(merge_prompt, max_length=2000, truncation=True)
    generated = response[0]["generated_text"]

    # Fix: "text-generation" pipelines echo the prompt in generated_text by
    # default, and the prompt itself contains Markdown tables full of pipes --
    # so searching for the first "|" in the raw output matched a pipe inside
    # the echoed prompt and returned the *input* tables. Strip the prompt
    # prefix before locating the table.
    if generated.startswith(merge_prompt):
        generated = generated[len(merge_prompt):]

    # Keep only the Markdown-table portion of the completion.
    start_idx = generated.find("|")
    if start_idx != -1:
        merged_table = generated[start_idx:]
    else:
        merged_table = "No valid Markdown table found in LLM output."

    return merged_table