updating analyzers to return a flagged_phrases list for each.
updating scraper to maintain article formatting on return to the front end.
app/routers/analyze.py
CHANGED
@@ -53,7 +53,6 @@ class AnalysisResponse(BaseModel):
     bias: str
     bias_score: float
     bias_percentage: float
-    flagged_phrases: List[str]
     media_score: MediaScore
 
 @router.post("/analyze", response_model=AnalysisResponse)
@@ -109,14 +108,13 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             "bias": str(analysis['details']['bias_analysis']['bias']),
             "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
             "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
-            "flagged_phrases": list(analysis['details']['sentiment_analysis']['flagged_phrases']),
             "media_score": {
                 "media_unmasked_score": float(analysis['media_unmasked_score']),
                 "rating": str(analysis['rating']),
                 "details": {
                     "headline_analysis": {
                         "headline_vs_content_score": float(analysis['details']['headline_analysis']['headline_vs_content_score']),
-                        "contradictory_phrases": analysis['details']['headline_analysis'].get('contradictory_phrases', [])
+                        "flagged_phrases": analysis['details']['headline_analysis'].get('flagged_phrases', [])
                     },
                     "sentiment_analysis": {
                         "sentiment": str(analysis['details']['sentiment_analysis']['sentiment']),
@@ -126,10 +124,12 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                     "bias_analysis": {
                         "bias": str(analysis['details']['bias_analysis']['bias']),
                         "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
-                        "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage'])
+                        "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
+                        "flagged_phrases": list(analysis['details']['bias_analysis']['flagged_phrases'])
                     },
                     "evidence_analysis": {
-                        "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score'])
+                        "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score']),
+                        "flagged_phrases": list(analysis['details']['evidence_analysis']['flagged_phrases'])
                     }
                 }
             }
@@ -144,7 +144,6 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                 'bias': response_dict['bias'],
                 'bias_score': response_dict['bias_score'],
                 'bias_percentage': response_dict['bias_percentage'],
-                'flagged_phrases': response_dict['flagged_phrases'],
                 'media_score': response_dict['media_score']
             }).execute()
 
@@ -157,39 +156,3 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             status_code=500,
             detail=f"Analysis failed: {str(e)}"
         )
-
-@router.get("/debug")
-async def debug_response():
-    mock_analysis = {
-        "headline": "Test Headline",
-        "content": "Test content",
-        "sentiment": "Neutral",
-        "bias": "Neutral",
-        "bias_score": 0.75,  # Note: 0-1 scale
-        "bias_percentage": 0,
-        "flagged_phrases": ["test phrase"],
-        "media_score": {
-            "media_unmasked_score": 75.5,
-            "rating": "Some Bias Present",
-            "details": {
-                "headline_analysis": {
-                    "headline_vs_content_score": 20,
-                    "contradictory_phrases": ["Sample contradiction"]
-                },
-                "sentiment_analysis": {
-                    "sentiment": "Neutral",
-                    "manipulation_score": 30,
-                    "flagged_phrases": ["Sample manipulative phrase"]
-                },
-                "bias_analysis": {
-                    "bias": "Neutral",
-                    "bias_score": 0.75,
-                    "bias_percentage": 0
-                },
-                "evidence_analysis": {
-                    "evidence_based_score": 80
-                }
-            }
-        }
-    }
-    return AnalysisResponse.parse_obj(mock_analysis)
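The net effect of this diff: the single top-level flagged_phrases field is gone, and each analyzer's flagged_phrases now lives under media_score.details. A hypothetical response after the change (sample values only, echoing the removed debug mock) looks like this:

# Hypothetical /analyze payload after this commit; values are sample data.
example_response = {
    "headline": "Test Headline",
    "content": "Test content",
    "sentiment": "Neutral",
    "bias": "Neutral",
    "bias_score": 0.0,
    "bias_percentage": 0.0,
    "media_score": {
        "media_unmasked_score": 75.5,
        "rating": "Some Bias Present",
        "details": {
            "headline_analysis": {"headline_vs_content_score": 20.0,
                                  "flagged_phrases": ["Sample contradiction"]},
            "sentiment_analysis": {"sentiment": "Neutral", "manipulation_score": 30,
                                   "flagged_phrases": ["Sample manipulative phrase"]},
            "bias_analysis": {"bias": "Neutral", "bias_score": 0.0, "bias_percentage": 0.0,
                              "flagged_phrases": []},
            "evidence_analysis": {"evidence_based_score": 80.0, "flagged_phrases": []},
        },
    },
}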
mediaunmasked/analyzers/bias_analyzer.py
CHANGED
@@ -25,16 +25,21 @@ class BiasAnalyzer:
         try:
             text_lower = text.lower()
 
-
+            flagged_phrases = []
+
+            # Count matches and collect flagged phrases
             left_count = sum(1 for word in self.left_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.left_keywords if word in text_lower])
             right_count = sum(1 for word in self.right_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.right_keywords if word in text_lower])
 
             total_words = left_count + right_count
             if total_words == 0:
                 return {
                     "bias": "Neutral",
                     "bias_score": 0.0,  # True neutral
-                    "bias_percentage": 0  # Neutral percentage
+                    "bias_percentage": 0,  # Neutral percentage
+                    "flagged_phrases": []
                 }
 
             # New bias score formula (-1.0 left, 0.0 neutral, 1.0 right)
@@ -63,7 +68,8 @@ class BiasAnalyzer:
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),  # Keep 2 decimal places
-                "bias_percentage": abs(round(bias_percentage, 1))
+                "bias_percentage": abs(round(bias_percentage, 1)),
+                "flagged_phrases": flagged_phrases
             }
 
         except Exception as e:
@@ -71,5 +77,6 @@ class BiasAnalyzer:
             return {
                 "bias": "Error",
                 "bias_score": 0.0,
-                "bias_percentage": 0
+                "bias_percentage": 0,
+                "flagged_phrases": []
             }
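For context, the matching logic the diff adds can be sketched standalone. The keyword lists below are invented stand-ins for the analyzer's real left_keywords/right_keywords, and the bias labels and percentage formula are assumptions for illustration; only the substring counting and flagged-phrase collection mirror the diff exactly:

# Standalone sketch of the flagged-phrase collection above (assumed keywords/labels).
left_keywords = ["universal healthcare", "climate crisis"]
right_keywords = ["border security", "deregulation"]

def detect_bias(text: str) -> dict:
    text_lower = text.lower()
    # Substring checks, as in the diff: matching keywords are both counted
    # and recorded as flagged phrases.
    flagged_phrases = [w for w in left_keywords + right_keywords if w in text_lower]
    left_count = sum(1 for w in left_keywords if w in text_lower)
    right_count = sum(1 for w in right_keywords if w in text_lower)

    total = left_count + right_count
    if total == 0:
        return {"bias": "Neutral", "bias_score": 0.0,
                "bias_percentage": 0, "flagged_phrases": []}

    bias_score = (right_count - left_count) / total  # -1.0 left ... 1.0 right
    bias = "Leaning Right" if bias_score > 0 else "Leaning Left" if bias_score < 0 else "Neutral"
    return {"bias": bias,
            "bias_score": round(bias_score, 2),
            "bias_percentage": abs(round(bias_score * 100, 1)),  # assumed mapping
            "flagged_phrases": flagged_phrases}

print(detect_bias("Critics say deregulation and border security dominated the speech."))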
mediaunmasked/analyzers/headline_analyzer.py
CHANGED
@@ -3,6 +3,8 @@ from typing import Dict, Any, List
 from transformers import pipeline
 from transformers import AutoTokenizer
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize
 
 logger = logging.getLogger(__name__)
 
@@ -54,19 +56,35 @@ class HeadlineAnalyzer:
 
     def _analyze_section(self, headline: str, section: str) -> Dict[str, float]:
         """Analyze a single section of content."""
-        # Combine headline and section for the NLI model
-        input_text = f"{headline} [SEP] {section}"
-
-        # Get model predictions for the whole section at once
-        result = self.nli_pipeline(input_text, top_k=None)
-        scores = {item['label']: item['score'] for item in result}
+        # Use a more robust method for sentence splitting
+        nltk.download('punkt')
+        sentences = sent_tokenize(section)
+
+        flagged_phrases = []
+        for sentence in sentences:
+            input_text = f"{headline} [SEP] {sentence}"
+            result = self.nli_pipeline(input_text, top_k=None)
+            scores = {item['label']: item['score'] for item in result}
+
+            # Log the model output for debugging
+            logger.info(f"Sentence: {sentence}")
+            logger.info(f"Scores: {scores}")
+
+            # Set the threshold for contradiction to anything higher than 0.1
+            if scores.get('CONTRADICTION', 0) > 0.1:  # Threshold set to > 0.1
+                flagged_phrases.append(sentence)
+
+        # Adjust the headline_vs_content_score based on contradictions
+        contradiction_penalty = len(flagged_phrases) * 0.1  # Example penalty per contradiction
+        adjusted_score = max(0, scores.get('ENTAILMENT', 0) - contradiction_penalty)
+
         logger.info("\nSection Analysis:")
         logger.info("-"*30)
         logger.info(f"Section preview: {section[:100]}...")
         for label, score in scores.items():
             logger.info(f"Label: {label:<12} Score: {score:.3f}")
 
-        return scores
+        return {"scores": scores, "flagged_phrases": flagged_phrases, "adjusted_score": adjusted_score}
 
     def analyze(self, headline: str, content: str) -> Dict[str, Any]:
         """Analyze how well the headline matches the content using an AI model."""
@@ -146,7 +164,7 @@ class HeadlineAnalyzer:
                 "headline_vs_content_score": round(final_score, 1),
                 "entailment_score": round(entailment_score, 2),
                 "contradiction_score": round(contradiction_score, 2),
-                "contradictory_phrases": []
+                "contradictory_phrases": scores.get('flagged_phrases', [])
             }
 
         except Exception as e:
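The new per-sentence NLI check can be exercised on its own. A minimal sketch, assuming an MNLI-style classifier with ENTAILMENT/NEUTRAL/CONTRADICTION labels (roberta-large-mnli is one such choice; the Space's actual model is whatever self.nli_pipeline wraps), and invented headline/section text:

# Minimal sketch of the sentence-level contradiction check added above.
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

nltk.download('punkt')  # cached after the first download
nli = pipeline("text-classification", model="roberta-large-mnli")  # assumed model

headline = "Study proves coffee cures cancer"
section = ("Researchers observed a weak correlation in mice. "
           "The authors stress that coffee has not been shown to cure anything.")

flagged_phrases = []
for sentence in sent_tokenize(section):
    result = nli(f"{headline} [SEP] {sentence}", top_k=None)  # scores for every label
    scores = {item['label']: item['score'] for item in result}
    if scores.get('CONTRADICTION', 0) > 0.1:  # same low threshold as the diff
        flagged_phrases.append(sentence)

print(flagged_phrases)  # sentences the model reads as contradicting the headline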
mediaunmasked/analyzers/scoring.py
CHANGED
@@ -75,10 +75,25 @@ class MediaScorer:
             "media_unmasked_score": round(final_score, 1),
             "rating": rating,
             "details": {
-                "headline_analysis": headline_analysis,
-                "sentiment_analysis": sentiment_analysis,
-                "bias_analysis": bias_analysis,
-                "evidence_analysis": evidence_analysis
+                "headline_analysis": {
+                    "headline_vs_content_score": headline_analysis["headline_vs_content_score"],
+                    "flagged_phrases": headline_analysis.get("flagged_phrases", [])
+                },
+                "sentiment_analysis": {
+                    "sentiment": sentiment_analysis["sentiment"],
+                    "manipulation_score": sentiment_analysis["manipulation_score"],
+                    "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
+                },
+                "bias_analysis": {
+                    "bias": bias_analysis["bias"],
+                    "bias_score": bias_analysis["bias_score"],
+                    "bias_percentage": bias_analysis["bias_percentage"],
+                    "flagged_phrases": bias_analysis.get("flagged_phrases", [])
+                },
+                "evidence_analysis": {
+                    "evidence_based_score": evidence_analysis["evidence_based_score"],
+                    "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
+                }
             }
         }
 
@@ -93,9 +108,9 @@ class MediaScorer:
             "media_unmasked_score": 0,
             "rating": "Error",
             "details": {
-                "headline_analysis": {"headline_vs_content_score": 0, "contradictory_phrases": []},
+                "headline_analysis": {"headline_vs_content_score": 0, "flagged_phrases": []},
                 "sentiment_analysis": {"sentiment": "Error", "manipulation_score": 0, "flagged_phrases": []},
-                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0},
-                "evidence_analysis": {"evidence_based_score": 0}
+                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0, "flagged_phrases": []},
+                "evidence_analysis": {"evidence_based_score": 0, "flagged_phrases": []}
             }
         }
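The .get(..., []) fallbacks keep the scorer tolerant of an analyzer that does not (yet) emit flagged_phrases. A small sketch with an invented result dict:

# Why the .get(..., []) defaults matter: a result without flagged_phrases
# (invented here) still assembles cleanly instead of raising KeyError.
legacy_bias_result = {"bias": "Neutral", "bias_score": 0.0, "bias_percentage": 0}

bias_details = {
    "bias": legacy_bias_result["bias"],
    "bias_score": legacy_bias_result["bias_score"],
    "bias_percentage": legacy_bias_result["bias_percentage"],
    "flagged_phrases": legacy_bias_result.get("flagged_phrases", []),  # -> []
}
assert bias_details["flagged_phrases"] == []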
mediaunmasked/scrapers/article_scraper.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, Optional
 import logging
 from urllib.parse import urlparse
 import requests
@@ -25,17 +25,15 @@ class ArticleScraper:
             response = self.session.get(url)
             response.raise_for_status()
             return response.text
-
         except Exception as e:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None
 
     def _process_element(self, element) -> str:
-        """Process an HTML element while preserving formatting."""
+        """Process an HTML element while preserving structure and formatting."""
         if isinstance(element, NavigableString):
             return str(element)
-
-        # Handle different types of elements
+
         tag_name = element.name
 
         if tag_name in ['p', 'div']:
@@ -64,90 +62,63 @@ class ArticleScraper:
 
         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
             level = int(tag_name[1])
-            prefix = '#' * (level + 1)  # Add one more #
+            prefix = '#' * (level + 1)  # Add one more # for clarity
             return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
 
-        # For other elements, just process their children
         return ''.join(self._process_element(child) for child in element.children)
 
     def _extract_content(self, container) -> str:
         """Extract and format content from a container element."""
         if not container:
             return ''
-
-        # Remove unwanted elements
+
         for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
             unwanted.decompose()
-
-        # Process the container
+
         content = self._process_element(container)
 
-        # Clean up extra whitespace and newlines
        content = '\n'.join(line.strip() for line in content.split('\n'))
         content = '\n'.join(filter(None, content.split('\n')))
 
         return content.strip()
 
-    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
-        """Extract content from the article based on its domain."""
+    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
+        """Extract content from PolitiFact articles."""
         try:
-
-            headline = None
-            headline_selectors = {
-                'politifact.com': ['h1.article__title'],
-                'snopes.com': ['header h1', 'article h1']
-            }
-
-            # Try domain-specific headline selectors
-            if domain in headline_selectors:
-                for selector in headline_selectors[domain]:
-                    headline = soup.select_one(selector)
-                    if headline:
-                        break
-
-            # Fallback to any h1 if no domain-specific headline found
-            if not headline:
-                headline = soup.find('h1')
-
-            headline_text = headline.get_text().strip() if headline else "No headline found"
-            self.logger.info(f"Found headline: {headline_text}")
-
-            # Find content - try domain-specific selectors first, then fallback to generic
-            content_div = None
-            content_selectors = {
-                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
-                'snopes.com': ['article']
-            }
-
-            # Try domain-specific content selectors
-            if domain in content_selectors:
-                for selector in content_selectors[domain]:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            # Fallback to generic content selectors
-            if not content_div:
-                for selector in ['article', 'main', '.content', '.article-content']:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            content = self._extract_content(content_div) if content_div else "No content found"
+            headline = soup.find('h1', class_='article__title') or soup.find('h1')
+            headline = headline.get_text(strip=True) if headline else "No headline found"
 
-
-            self.logger.warning("No content found in article")
-            self.logger.debug(f"Domain: {domain}")
-
-            return {"headline": headline_text, "content": content}
+            self.logger.info(f"Found headline: {headline}")
 
+            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
+            content = self._extract_content(content_div) if content_div else "No content found"
+
+            return {"headline": headline, "content": content}
+
         except Exception as e:
-            self.logger.error(f"Error extracting article content: {str(e)}")
+            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
 
+    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Fallback extraction method for unknown domains."""
+        headline = soup.find('h1')
+        headline_text = headline.get_text().strip() if headline else "No headline found"
+
+        content_div = None
+        common_selectors = ['article', 'main', '.content', '.article-content']
+
+        for selector in common_selectors:
+            content_div = soup.select_one(selector)
+            if content_div:
+                break
+
+        content = self._extract_content(content_div) if content_div else "No content found"
+
+        return {"headline": headline_text, "content": content}
+
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
         """
-        Main function to scrape an article from a URL.
+        Main function to scrape articles while maintaining structure.
         Returns a dictionary with headline and content.
         """
         html_content = self._fetch_page(url)
@@ -159,4 +130,8 @@ class ArticleScraper:
         domain = self._get_domain(url)
 
         self.logger.info(f"Scraping article from domain: {domain}")
-        return self._extract_article(soup, domain)
+
+        if 'politifact.com' in domain:
+            return self._extract_politifact(soup)
+
+        return self._extract_generic(soup, domain)