updating analyzers to return a flagged_phrases list for each.
updating scraper to maintain article formatting on return to the front end.
app/routers/analyze.py
CHANGED
@@ -53,7 +53,6 @@ class AnalysisResponse(BaseModel):
     bias: str
     bias_score: float
     bias_percentage: float
-    flagged_phrases: List[str]
     media_score: MediaScore
 
 @router.post("/analyze", response_model=AnalysisResponse)
@@ -109,14 +108,13 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             "bias": str(analysis['details']['bias_analysis']['bias']),
             "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
             "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
-            "flagged_phrases": list(analysis['details']['sentiment_analysis']['flagged_phrases']),
             "media_score": {
                 "media_unmasked_score": float(analysis['media_unmasked_score']),
                 "rating": str(analysis['rating']),
                 "details": {
                     "headline_analysis": {
                         "headline_vs_content_score": float(analysis['details']['headline_analysis']['headline_vs_content_score']),
-                        "contradictory_phrases": analysis['details']['headline_analysis'].get('contradictory_phrases', [])
+                        "flagged_phrases": analysis['details']['headline_analysis'].get('flagged_phrases', [])
                     },
                     "sentiment_analysis": {
                         "sentiment": str(analysis['details']['sentiment_analysis']['sentiment']),
@@ -126,10 +124,12 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                     "bias_analysis": {
                         "bias": str(analysis['details']['bias_analysis']['bias']),
                         "bias_score": float(analysis['details']['bias_analysis']['bias_score']),
-                        "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage'])
+                        "bias_percentage": float(analysis['details']['bias_analysis']['bias_percentage']),
+                        "flagged_phrases": list(analysis['details']['bias_analysis']['flagged_phrases'])
                     },
                     "evidence_analysis": {
-                        "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score'])
+                        "evidence_based_score": float(analysis['details']['evidence_analysis']['evidence_based_score']),
+                        "flagged_phrases": list(analysis['details']['evidence_analysis']['flagged_phrases'])
                     }
                 }
             }
@@ -144,7 +144,6 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
                 'bias': response_dict['bias'],
                 'bias_score': response_dict['bias_score'],
                 'bias_percentage': response_dict['bias_percentage'],
-                'flagged_phrases': response_dict['flagged_phrases'],
                 'media_score': response_dict['media_score']
             }).execute()
 
@@ -157,39 +156,3 @@ async def analyze_article(request: ArticleRequest) -> AnalysisResponse:
             status_code=500,
             detail=f"Analysis failed: {str(e)}"
         )
-
-@router.get("/debug")
-async def debug_response():
-    mock_analysis = {
-        "headline": "Test Headline",
-        "content": "Test content",
-        "sentiment": "Neutral",
-        "bias": "Neutral",
-        "bias_score": 0.75,  # Note: 0-1 scale
-        "bias_percentage": 0,
-        "flagged_phrases": ["test phrase"],
-        "media_score": {
-            "media_unmasked_score": 75.5,
-            "rating": "Some Bias Present",
-            "details": {
-                "headline_analysis": {
-                    "headline_vs_content_score": 20,
-                    "contradictory_phrases": ["Sample contradiction"]
-                },
-                "sentiment_analysis": {
-                    "sentiment": "Neutral",
-                    "manipulation_score": 30,
-                    "flagged_phrases": ["Sample manipulative phrase"]
-                },
-                "bias_analysis": {
-                    "bias": "Neutral",
-                    "bias_score": 0.75,
-                    "bias_percentage": 0
-                },
-                "evidence_analysis": {
-                    "evidence_based_score": 80
-                }
-            }
-        }
-    }
-    return AnalysisResponse.parse_obj(mock_analysis)
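The net effect of this diff: the single top-level flagged_phrases field is gone, and each analyzer's flagged_phrases now lives under media_score.details. A hypothetical response after the change (sample values only, echoing the removed debug mock) looks like this:

# Hypothetical /analyze payload after this commit; values are sample data.
example_response = {
    "headline": "Test Headline",
    "content": "Test content",
    "sentiment": "Neutral",
    "bias": "Neutral",
    "bias_score": 0.0,
    "bias_percentage": 0.0,
    "media_score": {
        "media_unmasked_score": 75.5,
        "rating": "Some Bias Present",
        "details": {
            "headline_analysis": {"headline_vs_content_score": 20.0,
                                  "flagged_phrases": ["Sample contradiction"]},
            "sentiment_analysis": {"sentiment": "Neutral", "manipulation_score": 30,
                                   "flagged_phrases": ["Sample manipulative phrase"]},
            "bias_analysis": {"bias": "Neutral", "bias_score": 0.0, "bias_percentage": 0.0,
                              "flagged_phrases": []},
            "evidence_analysis": {"evidence_based_score": 80.0, "flagged_phrases": []},
        },
    },
}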
mediaunmasked/analyzers/bias_analyzer.py
CHANGED
@@ -25,16 +25,21 @@ class BiasAnalyzer:
         try:
             text_lower = text.lower()
 
-
+            flagged_phrases = []
+
+            # Count matches and collect flagged phrases
             left_count = sum(1 for word in self.left_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.left_keywords if word in text_lower])
             right_count = sum(1 for word in self.right_keywords if word in text_lower)
+            flagged_phrases.extend([word for word in self.right_keywords if word in text_lower])
 
             total_words = left_count + right_count
             if total_words == 0:
                 return {
                     "bias": "Neutral",
                     "bias_score": 0.0,  # True neutral
-                    "bias_percentage": 0  # Neutral percentage
+                    "bias_percentage": 0,  # Neutral percentage
+                    "flagged_phrases": []
                 }
 
             # New bias score formula (-1.0 left, 0.0 neutral, 1.0 right)
@@ -63,7 +68,8 @@ class BiasAnalyzer:
             return {
                 "bias": bias,
                 "bias_score": round(bias_score, 2),  # Keep 2 decimal places
-                "bias_percentage": abs(round(bias_percentage, 1))
+                "bias_percentage": abs(round(bias_percentage, 1)),
+                "flagged_phrases": flagged_phrases
             }
 
         except Exception as e:
@@ -71,5 +77,6 @@ class BiasAnalyzer:
             return {
                 "bias": "Error",
                 "bias_score": 0.0,
-                "bias_percentage": 0
+                "bias_percentage": 0,
+                "flagged_phrases": []
             }
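For context, the matching logic the diff adds can be sketched standalone. The keyword lists below are invented stand-ins for the analyzer's real left_keywords/right_keywords, and the bias labels and percentage formula are assumptions for illustration; only the substring counting and flagged-phrase collection mirror the diff exactly:

# Standalone sketch of the flagged-phrase collection above (assumed keywords/labels).
left_keywords = ["universal healthcare", "climate crisis"]
right_keywords = ["border security", "deregulation"]

def detect_bias(text: str) -> dict:
    text_lower = text.lower()
    # Substring checks, as in the diff: matching keywords are both counted
    # and recorded as flagged phrases.
    flagged_phrases = [w for w in left_keywords + right_keywords if w in text_lower]
    left_count = sum(1 for w in left_keywords if w in text_lower)
    right_count = sum(1 for w in right_keywords if w in text_lower)

    total = left_count + right_count
    if total == 0:
        return {"bias": "Neutral", "bias_score": 0.0,
                "bias_percentage": 0, "flagged_phrases": []}

    bias_score = (right_count - left_count) / total  # -1.0 left ... 1.0 right
    bias = "Leaning Right" if bias_score > 0 else "Leaning Left" if bias_score < 0 else "Neutral"
    return {"bias": bias,
            "bias_score": round(bias_score, 2),
            "bias_percentage": abs(round(bias_score * 100, 1)),  # assumed mapping
            "flagged_phrases": flagged_phrases}

print(detect_bias("Critics say deregulation and border security dominated the speech."))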
mediaunmasked/analyzers/headline_analyzer.py
CHANGED
@@ -3,6 +3,8 @@ from typing import Dict, Any, List
 from transformers import pipeline
 from transformers import AutoTokenizer
 import numpy as np
+import nltk
+from nltk.tokenize import sent_tokenize
 
 logger = logging.getLogger(__name__)
 
@@ -54,19 +56,35 @@ class HeadlineAnalyzer:
 
     def _analyze_section(self, headline: str, section: str) -> Dict[str, float]:
         """Analyze a single section of content."""
-        # Combine headline and section for the NLI model
-        input_text = f"{headline} [SEP] {section}"
-
-        # Get model predictions for the whole section at once
-        result = self.nli_pipeline(input_text, top_k=None)
-        scores = {item['label']: item['score'] for item in result}
+        # Use a more robust method for sentence splitting
+        nltk.download('punkt')
+        sentences = sent_tokenize(section)
+
+        flagged_phrases = []
+        for sentence in sentences:
+            input_text = f"{headline} [SEP] {sentence}"
+            result = self.nli_pipeline(input_text, top_k=None)
+            scores = {item['label']: item['score'] for item in result}
+
+            # Log the model output for debugging
+            logger.info(f"Sentence: {sentence}")
+            logger.info(f"Scores: {scores}")
+
+            # Set the threshold for contradiction to anything higher than 0.1
+            if scores.get('CONTRADICTION', 0) > 0.1:  # Threshold set to > 0.1
+                flagged_phrases.append(sentence)
+
+        # Adjust the headline_vs_content_score based on contradictions
+        contradiction_penalty = len(flagged_phrases) * 0.1  # Example penalty per contradiction
+        adjusted_score = max(0, scores.get('ENTAILMENT', 0) - contradiction_penalty)
+
         logger.info("\nSection Analysis:")
         logger.info("-"*30)
         logger.info(f"Section preview: {section[:100]}...")
         for label, score in scores.items():
             logger.info(f"Label: {label:<12} Score: {score:.3f}")
 
-        return scores
+        return {"scores": scores, "flagged_phrases": flagged_phrases, "adjusted_score": adjusted_score}
 
     def analyze(self, headline: str, content: str) -> Dict[str, Any]:
         """Analyze how well the headline matches the content using an AI model."""
@@ -146,7 +164,7 @@ class HeadlineAnalyzer:
                 "headline_vs_content_score": round(final_score, 1),
                 "entailment_score": round(entailment_score, 2),
                 "contradiction_score": round(contradiction_score, 2),
-                "contradictory_phrases": []
+                "contradictory_phrases": scores.get('flagged_phrases', [])
             }
 
         except Exception as e:
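The new per-sentence NLI check can be exercised on its own. A minimal sketch, assuming an MNLI-style classifier with ENTAILMENT/NEUTRAL/CONTRADICTION labels (roberta-large-mnli is one such choice; the Space's actual model is whatever self.nli_pipeline wraps), and invented headline/section text:

# Minimal sketch of the sentence-level contradiction check added above.
import nltk
from nltk.tokenize import sent_tokenize
from transformers import pipeline

nltk.download('punkt')  # cached after the first download
nli = pipeline("text-classification", model="roberta-large-mnli")  # assumed model

headline = "Study proves coffee cures cancer"
section = ("Researchers observed a weak correlation in mice. "
           "The authors stress that coffee has not been shown to cure anything.")

flagged_phrases = []
for sentence in sent_tokenize(section):
    result = nli(f"{headline} [SEP] {sentence}", top_k=None)  # scores for every label
    scores = {item['label']: item['score'] for item in result}
    if scores.get('CONTRADICTION', 0) > 0.1:  # same low threshold as the diff
        flagged_phrases.append(sentence)

print(flagged_phrases)  # sentences the model reads as contradicting the headline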
mediaunmasked/analyzers/scoring.py
CHANGED
@@ -75,10 +75,25 @@ class MediaScorer:
             "media_unmasked_score": round(final_score, 1),
             "rating": rating,
             "details": {
-                "headline_analysis": headline_analysis,
-                "sentiment_analysis": sentiment_analysis,
-                "bias_analysis": bias_analysis,
-                "evidence_analysis": evidence_analysis
+                "headline_analysis": {
+                    "headline_vs_content_score": headline_analysis["headline_vs_content_score"],
+                    "flagged_phrases": headline_analysis.get("flagged_phrases", [])
+                },
+                "sentiment_analysis": {
+                    "sentiment": sentiment_analysis["sentiment"],
+                    "manipulation_score": sentiment_analysis["manipulation_score"],
+                    "flagged_phrases": sentiment_analysis.get("flagged_phrases", [])
+                },
+                "bias_analysis": {
+                    "bias": bias_analysis["bias"],
+                    "bias_score": bias_analysis["bias_score"],
+                    "bias_percentage": bias_analysis["bias_percentage"],
+                    "flagged_phrases": bias_analysis.get("flagged_phrases", [])
+                },
+                "evidence_analysis": {
+                    "evidence_based_score": evidence_analysis["evidence_based_score"],
+                    "flagged_phrases": evidence_analysis.get("flagged_phrases", [])
+                }
             }
         }
 
@@ -93,9 +108,9 @@ class MediaScorer:
             "media_unmasked_score": 0,
             "rating": "Error",
             "details": {
-                "headline_analysis": {"headline_vs_content_score": 0, "contradictory_phrases": []},
+                "headline_analysis": {"headline_vs_content_score": 0, "flagged_phrases": []},
                 "sentiment_analysis": {"sentiment": "Error", "manipulation_score": 0, "flagged_phrases": []},
-                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0},
-                "evidence_analysis": {"evidence_based_score": 0}
+                "bias_analysis": {"bias": "Error", "bias_score": 0.0, "bias_percentage": 0, "flagged_phrases": []},
+                "evidence_analysis": {"evidence_based_score": 0, "flagged_phrases": []}
             }
         }
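The .get(..., []) fallbacks keep the scorer tolerant of an analyzer that does not (yet) emit flagged_phrases. A small sketch with an invented result dict:

# Why the .get(..., []) defaults matter: a result without flagged_phrases
# (invented here) still assembles cleanly instead of raising KeyError.
legacy_bias_result = {"bias": "Neutral", "bias_score": 0.0, "bias_percentage": 0}

bias_details = {
    "bias": legacy_bias_result["bias"],
    "bias_score": legacy_bias_result["bias_score"],
    "bias_percentage": legacy_bias_result["bias_percentage"],
    "flagged_phrases": legacy_bias_result.get("flagged_phrases", []),  # -> []
}
assert bias_details["flagged_phrases"] == []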
mediaunmasked/scrapers/article_scraper.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, Optional
 import logging
 from urllib.parse import urlparse
 import requests
@@ -25,17 +25,15 @@ class ArticleScraper:
             response = self.session.get(url)
             response.raise_for_status()
             return response.text
-
         except Exception as e:
             self.logger.error(f"Error fetching {url}: {str(e)}")
             return None
 
     def _process_element(self, element) -> str:
-        """Process an HTML element while preserving formatting."""
+        """Process an HTML element while preserving structure and formatting."""
         if isinstance(element, NavigableString):
             return str(element)
-
-        # Handle different types of elements
+
         tag_name = element.name
 
         if tag_name in ['p', 'div']:
@@ -64,90 +62,63 @@ class ArticleScraper:
 
         elif tag_name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
             level = int(tag_name[1])
-            prefix = '#' * (level + 1)  # Add one more #
+            prefix = '#' * (level + 1)  # Add one more # for clarity
             return f'\n\n{prefix} ' + ''.join(self._process_element(child) for child in element.children).strip() + '\n'
 
-        # For other elements, just process their children
         return ''.join(self._process_element(child) for child in element.children)
 
     def _extract_content(self, container) -> str:
         """Extract and format content from a container element."""
         if not container:
             return ''
-
-        # Remove unwanted elements
+
         for unwanted in container.find_all(['script', 'style', 'iframe', 'aside']):
             unwanted.decompose()
-
-        # Process the container
+
         content = self._process_element(container)
 
-        # Clean up extra whitespace and newlines
        content = '\n'.join(line.strip() for line in content.split('\n'))
         content = '\n'.join(filter(None, content.split('\n')))
 
         return content.strip()
 
-    def _extract_article(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
-        """Extract content from the article based on its domain."""
+    def _extract_politifact(self, soup: BeautifulSoup) -> Dict[str, str]:
+        """Extract content from PolitiFact articles."""
         try:
-
-            headline = None
-            headline_selectors = {
-                'politifact.com': ['h1.article__title'],
-                'snopes.com': ['header h1', 'article h1']
-            }
-
-            # Try domain-specific headline selectors
-            if domain in headline_selectors:
-                for selector in headline_selectors[domain]:
-                    headline = soup.select_one(selector)
-                    if headline:
-                        break
-
-            # Fallback to any h1 if no domain-specific headline found
-            if not headline:
-                headline = soup.find('h1')
-
-            headline_text = headline.get_text().strip() if headline else "No headline found"
-            self.logger.info(f"Found headline: {headline_text}")
-
-            # Find content - try domain-specific selectors first, then fallback to generic
-            content_div = None
-            content_selectors = {
-                'politifact.com': ['article.article', '.article__text', '.m-textblock'],
-                'snopes.com': ['article']
-            }
-
-            # Try domain-specific content selectors
-            if domain in content_selectors:
-                for selector in content_selectors[domain]:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            # Fallback to generic content selectors
-            if not content_div:
-                for selector in ['article', 'main', '.content', '.article-content']:
-                    content_div = soup.select_one(selector)
-                    if content_div:
-                        break
-
-            content = self._extract_content(content_div) if content_div else "No content found"
+            headline = soup.find('h1', class_='article__title') or soup.find('h1')
+            headline = headline.get_text(strip=True) if headline else "No headline found"
 
-
-            self.logger.warning("No content found in article")
-            self.logger.debug(f"Domain: {domain}")
-
-            return {"headline": headline_text, "content": content}
+            self.logger.info(f"Found headline: {headline}")
 
+            content_div = soup.find('article', class_='article') or soup.select_one('.article__text, .m-textblock')
+            content = self._extract_content(content_div) if content_div else "No content found"
+
+            return {"headline": headline, "content": content}
+
         except Exception as e:
-            self.logger.error(f"Error extracting article content: {str(e)}")
+            self.logger.error(f"Error extracting PolitiFact content: {str(e)}")
             return {"headline": "Error", "content": f"Failed to extract content: {str(e)}"}
 
+    def _extract_generic(self, soup: BeautifulSoup, domain: str) -> Dict[str, str]:
+        """Fallback extraction method for unknown domains."""
+        headline = soup.find('h1')
+        headline_text = headline.get_text().strip() if headline else "No headline found"
+
+        content_div = None
+        common_selectors = ['article', 'main', '.content', '.article-content']
+
+        for selector in common_selectors:
+            content_div = soup.select_one(selector)
+            if content_div:
+                break
+
+        content = self._extract_content(content_div) if content_div else "No content found"
+
+        return {"headline": headline_text, "content": content}
+
     def scrape_article(self, url: str) -> Optional[Dict[str, str]]:
         """
-        Main function to scrape an article from a URL.
+        Main function to scrape articles while maintaining structure.
         Returns a dictionary with headline and content.
         """
         html_content = self._fetch_page(url)
@@ -159,4 +130,8 @@ class ArticleScraper:
         domain = self._get_domain(url)
 
         self.logger.info(f"Scraping article from domain: {domain}")
-        return self._extract_article(soup, domain)
+
+        if 'politifact.com' in domain:
+            return self._extract_politifact(soup)
+
+        return self._extract_generic(soup, domain)