"""
Claim extraction utilities for the misinformation detector: pull the main
factual claim out of free text and shorten claims for evidence retrieval.
"""

import logging
import re
import time

from utils.performance import PerformanceTracker
from utils.models import get_nlp_model, get_llm_model
from modules.classification import normalize_tense

logger = logging.getLogger("misinformation_detector")
performance_tracker = PerformanceTracker()

def extract_claims(text):
    """
    Extract the main factual claim from the provided text.

    Concise inputs (< 30 words) are preserved exactly as given.
    Longer text is routed through an OpenAI model to extract the claim.
    """
    logger.info(f"Extracting claims from: {text}")
    start_time = time.time()

    # First, check whether the input already appears to be a concise claim
    if len(text.split()) < 30:
        logger.info("Input appears to be a concise claim already, preserving as-is")
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()
        return text

    try:
        # For longer text, use OpenAI for extraction
        extracted_claim = extract_with_openai(text)

        # Log processing time
        performance_tracker.log_processing_time(start_time)
        performance_tracker.log_claim_processed()

        logger.info(f"Extracted claim: {extracted_claim}")
        return extracted_claim
    except Exception as e:
        logger.error(f"Error extracting claims: {str(e)}")
        # Fall back to the original text on error
        return text

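# Expected behavior, sketched with hypothetical inputs (assumes the
# utils/ and modules/ packages from this repo are importable):
#
#   >>> extract_claims("The Eiffel Tower is in Paris.")
#   'The Eiffel Tower is in Paris.'      # under 30 words: preserved as-is
#
# Inputs of 30 words or more go through extract_with_openai() below and
# are validated against the original text before being returned.
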
def extract_with_openai(text):
    """
    Use an OpenAI model for claim extraction.
    """
    try:
        # Get LLM model
        llm_model = get_llm_model()

        # Use a very explicit prompt to discourage hallucination
        prompt = f"""
        Extract the main factual claim from the following text.
        DO NOT add any information not present in the original text.
        DO NOT add locations, dates, or other details.
        ONLY extract what is explicitly stated.

        Text: {text}

        Main factual claim:
        """

        # Call OpenAI with temperature=0 for deterministic output
        response = llm_model.invoke(prompt, temperature=0)
        extracted_claim = response.content.strip()

        # Strip a leading "Main factual claim:"-style label if the model
        # echoes the prompt, without mangling claims that legitimately
        # contain colons (splitting on every ":" would truncate those)
        extracted_claim = re.sub(
            r"(?i)^[^:\n]{0,60}\bclaim\s*:\s*", "", extracted_claim
        ).strip()

        logger.info(f"OpenAI extraction: {extracted_claim}")

        # Validate that the extraction adds no information absent from the original
        nlp = get_nlp_model()
        extracted_claim = validate_extraction(text, extracted_claim, nlp)

        return extracted_claim
    except Exception as e:
        logger.error(f"Error in OpenAI claim extraction: {str(e)}")
        return text  # Fall back to the original text

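# The label cleanup above, illustrated with hypothetical model outputs:
#
#   "Main factual claim: The Senate passed the bill on Tuesday."
#       -> "The Senate passed the bill on Tuesday."
#
#   "Report: GDP grew 3% in Q2"   (no "claim:" label before the colon)
#       -> left intact; colons inside genuine claims are preserved
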
def validate_extraction(original_text, extracted_claim, nlp):
    """
    Validate that the extracted claim doesn't add information
    that is not present in the original text.
    """
    # If extraction fails or is empty, return original
    if not extracted_claim or extracted_claim.strip() == "":
        logger.warning("Empty extraction result, using original text")
        return original_text

    # Check for added location information
    location_terms = ["united states", "america", "u.s.", "usa", "china", "india",
                      "europe", "russia", "japan", "uk", "germany", "france", "australia"]
    for term in location_terms:
        if term in extracted_claim.lower() and term not in original_text.lower():
            logger.warning(f"Extraction added location '{term}' not in original, using original text")
            return original_text

    # Check for entity preservation/addition using spaCy
    try:
        # Get entities from extracted text
        extracted_doc = nlp(extracted_claim)
        extracted_entities = [ent.text.lower() for ent in extracted_doc.ents]

        # Get entities from original text
        original_doc = nlp(original_text)
        original_entities = [ent.text.lower() for ent in original_doc.ents]

        # Reject the extraction if it introduces an entity that does not
        # appear (even as a substring match) in the original
        for entity in extracted_entities:
            if not any(entity in orig_entity or orig_entity in entity
                       for orig_entity in original_entities):
                logger.warning(f"Extraction added new entity '{entity}', using original text")
                return original_text

        return extracted_claim
    except Exception as e:
        logger.error(f"Error in extraction validation: {str(e)}")
        return original_text  # On error, safer to return original

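# Illustration (hypothetical): given the original text "The president
# visited a factory yesterday", a model output such as "The president
# visited a factory in the United States yesterday" is rejected by the
# location check ("united states" is absent from the original), and the
# spaCy entity comparison provides a second line of defense against any
# other invented names; the original text is returned instead.
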
def shorten_claim_for_evidence(claim):
    """
    Shorten a claim for evidence retrieval by preserving important entities,
    verbs, and keywords while maintaining claim context.

    Args:
        claim (str): The original claim

    Returns:
        str: A shortened version of the claim optimized for evidence retrieval
    """
    try:
        # Normalize tense first so the shortened claim matches how the
        # event is likely to be reported, then work on the normalized text
        normalized_claim = normalize_tense(claim)

        # Get NLP model
        nlp = get_nlp_model()

        # Process the normalized claim with NLP
        doc = nlp(normalized_claim)

        # Components to extract
        important_components = []

        # 1. Extract all named entities as highest priority
        entities = [ent.text for ent in doc.ents]
        important_components.extend(entities)

        # 2. Extract key proper nouns if not already captured in entities
        for token in doc:
            if token.pos_ == "PROPN" and token.text not in important_components:
                important_components.append(token.text)

        # 3. Extract main verbs (actions)
        verbs = []
        for token in doc:
            if token.pos_ == "VERB" and not token.is_stop:
                verbs.append(token.text)

        # 4. Check for important title terms like "president", "prime minister"
        title_terms = ["president", "prime minister", "minister", "chancellor", "premier",
                       "governor", "mayor", "senator", "CEO", "founder", "director"]
        for term in title_terms:
            if term in normalized_claim.lower():
                # Find the full phrase (e.g., "Canadian Prime Minister")
                matches = re.finditer(r'(?i)(?:\w+\s+)*\b' + re.escape(term) + r'\b(?:\s+\w+)*', normalized_claim)
                for match in matches:
                    phrase = match.group(0)
                    if phrase not in important_components:
                        important_components.append(phrase)

        # 5. Add important temporal indicators
        temporal_terms = ["today", "yesterday", "recently", "just", "now",
                          "current", "currently", "latest", "new", "week",
                          "month", "year", "announces", "announced", "introduces",
                          "introduced", "launches", "launched", "releases",
                          "released", "rolls out", "rolled out", "presents",
                          "presented", "unveils", "unveiled", "starts", "started",
                          "begins", "began", "initiates", "initiated", "anymore"]

        # Capture significant temporal context (up to two words on either side)
        temporal_context = []
        for term in temporal_terms:
            if term in normalized_claim.lower():
                temporal_matches = re.finditer(r'(?i)(?:\w+\s+){0,2}\b' + re.escape(term) + r'\b(?:\s+\w+){0,2}', normalized_claim)
                for match in temporal_matches:
                    temporal_context.append(match.group(0))

        # 6. Always include negation words as they're critical for meaning
        negation_terms = ["not", "no longer", "former", "ex-", "isn't", "aren't", "doesn't", "don't"]
        negation_context = []
        for term in negation_terms:
            if term in normalized_claim.lower():
                # Capture the context around the negation (up to three words before and after)
                neg_matches = re.finditer(r'(?i)(?:\w+\s+){0,3}\b' + re.escape(term) + r'\b(?:\s+\w+){0,3}', normalized_claim)
                for match in neg_matches:
                    negation_context.append(match.group(0))

        # Combine all components
        all_components = important_components + verbs + temporal_context + negation_context

        # Remove duplicates while preserving order
        seen = set()
        unique_components = []
        for component in all_components:
            if component.lower() not in seen:
                seen.add(component.lower())
                unique_components.append(component)

        # If we extracted too few components (< 2), fall back to the claim itself
        if len(unique_components) < 2:
            # If the claim is already short (< 10 words), use it as-is
            if len(normalized_claim.split()) < 10:
                return normalized_claim
            # Otherwise, use the first 8 words
            return " ".join(normalized_claim.split()[:8])

        # Sort components by their position in the claim to keep the
        # approximate original word order, then join them
        def get_position(comp):
            return normalized_claim.lower().find(comp.lower())

        unique_components.sort(key=get_position)
        shortened_claim = " ".join(unique_components)

        # If the shortened claim is still too long, limit it to the first 10 words
        if len(shortened_claim.split()) > 10:
            return " ".join(shortened_claim.split()[:10])

        return shortened_claim
    except Exception as e:
        logger.error(f"Error in shortening claim: {str(e)}")
        # Return the original claim on error
        return claim
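
if __name__ == "__main__":
    # Minimal smoke test with a hypothetical claim. Assumes the utils/
    # and modules/ packages from this repo are on the import path and
    # that the spaCy model behind get_nlp_model() is installed; the
    # short input below never triggers the OpenAI call.
    logging.basicConfig(level=logging.INFO)

    sample = "Canadian Prime Minister announced a new carbon tax yesterday"
    print(extract_claims(sample))              # < 30 words: returned unchanged
    print(shorten_claim_for_evidence(sample))  # entities + verbs + temporal cues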