Spaces:
Runtime error
Runtime error
# models/location_analysis.py | |
from .model_loader import load_model | |
from geopy.geocoders import Nominatim | |
from .logging_config import logger | |
import re | |
import time | |
from typing import Dict, Any | |
from geopy.distance import geodesic | |
# Shared Nominatim (OpenStreetMap) geocoder client used by the geocoding-based
# verify_* helpers below. `timeout=10` is the default per-request timeout in
# seconds; individual calls pass the same value explicitly.
geocoder = Nominatim(user_agent="indian_property_verifier", timeout=10)
def validate_address_format(address: str) -> bool:
    """Heuristically decide whether *address* looks like a street address.

    The address must be non-empty, at least 10 characters long once
    stripped, contain at least two comma-separated parts, match at least
    two of three shape patterns (digits, letters/whitespace, a common
    address keyword) and contain at least one known locality/area keyword.
    Keyword checks are plain substring tests on the lower-cased address.

    Args:
        address: Free-form address text.

    Returns:
        True when every heuristic passes, otherwise False.
    """
    if not address:
        return False

    # Too short to be a meaningful address.
    if len(address.strip()) < 10:
        return False

    # Expect at least two comma-separated parts (e.g. area and city).
    parts = [piece.strip() for piece in address.split(',')]
    if len(parts) < 2:
        return False

    lowered = address.lower()

    shape_patterns = (
        r'\d+',  # digits
        r'[A-Za-z\s]+',  # letters / whitespace
        r'(?:street|road|avenue|lane|colony|society|apartment|flat|house|building|plot|block|sector|phase|floor|wing|area|locality|main|cross|circle|square|market|ward|zone|mandal|municipal|corporation|greater)',  # common address terms
    )
    hits = 0
    for regex in shape_patterns:
        if re.search(regex, lowered):
            hits += 1
    if hits < 2:
        return False

    location_terms = (
        'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater',
        'street', 'road', 'avenue', 'lane', 'colony', 'society',
    )
    area_terms = (
        'colony', 'society', 'apartment', 'flat', 'house', 'plot', 'block', 'sector',
        'area', 'locality', 'main', 'cross', 'circle', 'square', 'market',
    )
    # A single locality or area keyword anywhere in the text is enough.
    for term in location_terms + area_terms:
        if term in lowered:
            return True
    return False
def validate_postal_code(postal_code: str) -> bool:
    """Check that *postal_code* has the shape of an Indian PIN code.

    The value is converted to text, stripped, and has its spaces removed.
    It must then be exactly six digits with a leading digit between 1 and 8.

    Args:
        postal_code: PIN code as a string (other types are stringified).

    Returns:
        True when the code is six digits starting with 1-8, else False.
    """
    if not postal_code:
        return False

    digits = str(postal_code).strip().replace(' ', '')
    if re.match(r'^\d{6}$', digits) is None:
        return False

    # The leading digit identifies the postal region; only 1-8 are accepted.
    return 1 <= int(digits[0]) <= 8
def validate_coordinates(latitude: str, longitude: str) -> bool:
    """Check that a latitude/longitude pair parses and lies within India.

    Both values are stringified, stripped and parsed as floats, then
    compared against an approximate, slightly padded bounding box for India
    (latitude 6.0-38.0, longitude 67.0-98.0, inclusive).

    The previous implementation also carried a "no more than 6 decimal
    places" check, but it compared the value with itself rounded to six
    decimals using a 1e-6 tolerance. That can never fail, so the dead
    branch has been removed; results are unchanged.

    Args:
        latitude: Latitude in decimal degrees (string or number).
        longitude: Longitude in decimal degrees (string or number).

    Returns:
        True when both values parse and fall inside the bounding box.
        False otherwise, including for unparsable input and NaN.
    """
    lat_min, lat_max = 6.0, 38.0
    lng_min, lng_max = 67.0, 98.0

    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    # NaN fails every comparison, so it is rejected here as well.
    return lat_min <= lat <= lat_max and lng_min <= lng <= lng_max
def verify_location_in_city(address: str, city: str) -> bool:
    """Verify via geocoding that *address* plausibly lies in *city*.

    Two strategies are tried in order:

    1. Strict match: several query strings are built from the address
       components and geocoded. A result is accepted when its address
       mentions the city (in one of several spellings) and at least two
       "key" components of the input (those containing ward / zone /
       mandal / municipal / corporation / greater) appear in it.
    2. Distance fallback: the city and the full address are geocoded
       separately, and the address is accepted when it lies within a
       tier-based radius of the city (50 km metro, 30 km tier 2, 20 km
       otherwise).

    Changes from the previous version:
      * An address without a comma no longer raises IndexError on
        ``address_components[1]``, which made the function return False
        before attempting any geocoding.
      * Inputs that are empty after stripping are rejected up front.
      * Queries with an empty component (e.g. ``"City, , India"``) and
        duplicate queries are skipped.
      * The one-second rate-limit pause also runs after a failed request.
      * The strict-match loop is skipped when fewer than two key
        components exist, since it could never succeed in that case.

    Args:
        address: Free-form address; comma-separated components are used.
        city: City name.

    Returns:
        True when either strategy confirms the address. False otherwise,
        including on any geocoding error (errors are logged, not raised).
    """
    if not address or not city:
        return False

    try:
        address = address.strip()
        city = city.strip()
        # Whitespace-only input would otherwise geocode as just the city.
        if not address or not city:
            return False

        city_lower = city.lower()
        address_components = [comp.strip() for comp in address.split(',')]

        def component_with(keyword: str) -> str:
            """Return the first address component containing *keyword*, or ''."""
            return next(
                (comp for comp in address_components if keyword in comp.lower()),
                ''
            )

        second_component = address_components[1] if len(address_components) > 1 else ''
        candidate_parts = [
            (address,),                                      # full address
            (city, address_components[0]),                   # first component
            (city, second_component),                        # second component
            (city, component_with('municipal corporation')),
            (city, component_with('mandal')),
            (address_components[0], city),                   # basic format
            (component_with('zone'), city),
        ]
        queries = []
        for parts in candidate_parts:
            if not all(parts):
                continue  # an empty component would produce a malformed query
            query = ", ".join(parts) + ", India"
            if query not in queries:
                queries.append(query)

        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-'),
            f"{city_lower} city",
            f"{city_lower} district",
            f"{city_lower} municipal corporation",
            f"greater {city_lower}",
            f"greater {city_lower} municipal corporation"
        ]
        key_components = [
            comp.lower() for comp in address_components
            if any(keyword in comp.lower() for keyword in [
                'ward', 'zone', 'mandal', 'municipal', 'corporation', 'greater'
            ])
        ]

        # The strict match needs two matching key components, so it cannot
        # succeed with fewer; skip the network round-trips in that case.
        if len(key_components) >= 2:
            for query in queries:
                try:
                    location = geocoder.geocode(query, timeout=10)
                    if location:
                        location_address = location.address.lower()
                        if any(var in location_address for var in city_variations):
                            location_components = [
                                comp.strip() for comp in location_address.split(',')
                            ]
                            matching_components = sum(
                                1 for comp in key_components
                                if any(comp in loc_comp for loc_comp in location_components)
                            )
                            if matching_components >= 2:
                                return True
                except Exception as e:
                    logger.debug(f"Error in address verification: {str(e)}")
                time.sleep(1)  # Rate limiting (applies after failures too)

        # Fallback: compare the geocoded address position with the city's.
        try:
            city_location = geocoder.geocode(f"{city}, India", timeout=10)
            if city_location:
                time.sleep(1)  # Rate limiting between consecutive requests
                address_location = geocoder.geocode(f"{address}, {city}, India", timeout=10)
                if address_location:
                    city_coords = (city_location.latitude, city_location.longitude)
                    address_coords = (address_location.latitude, address_location.longitude)
                    distance = geodesic(city_coords, address_coords).kilometers

                    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
                    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
                    if any(name in city_lower for name in metro_cities):
                        max_distance = 50  # 50km for metro cities
                    elif any(name in city_lower for name in tier2_cities):
                        max_distance = 30  # 30km for tier 2 cities
                    else:
                        max_distance = 20  # 20km for other cities
                    return distance <= max_distance
        except Exception as e:
            logger.debug(f"Error in reverse geocoding: {str(e)}")
        return False
    except Exception as e:
        logger.error(f"Error in location verification: {str(e)}")
        return False
def verify_city_in_state(city: str, state: str) -> bool:
    """Verify via geocoding that *city* is located in *state*.

    Up to three query formats combining the city and state are geocoded.
    The check passes as soon as one result's address contains both a
    spelling of the city and a spelling of the state (as-is, with spaces
    removed, or with spaces replaced by hyphens), compared
    case-insensitively.

    Changes from the previous version: bare ``except:`` clauses (which
    also swallowed KeyboardInterrupt/SystemExit) are replaced by logged
    ``except Exception`` handlers, inputs are stripped, and the one-second
    rate-limit pause is no longer skipped after a failed request.

    Args:
        city: City name.
        state: State name.

    Returns:
        True when a geocoding result mentions both names. False otherwise,
        including on geocoding errors (logged, not raised).
    """
    if not city or not state:
        return False

    try:
        city = city.strip()
        state = state.strip()
        if not city or not state:
            return False

        city_lower = city.lower()
        state_lower = state.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]
        state_variations = [
            state_lower,
            state_lower.replace(' ', ''),
            state_lower.replace(' ', '-')
        ]
        formats = [
            f"{city}, {state}, India",
            f"{state}, {city}, India",
            f"{city}, {state}"
        ]

        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(city_var in location_address for city_var in city_variations) and \
                            any(state_var in location_address for state_var in state_variations):
                        return True
            except Exception as e:
                logger.debug(f"Error verifying city in state: {str(e)}")
            time.sleep(1)  # Rate limiting (applies after failures too)
        return False
    except Exception as e:
        logger.error(f"Error in city/state verification: {str(e)}")
        return False
def verify_state_in_country(state: str, country: str = "India") -> bool:
    """Check that *state* is an Indian state or union territory.

    Only India is supported: the lookup table holds Indian states and
    union territories, so any other explicit *country* yields False.
    Previously the *country* argument was ignored altogether. An empty or
    missing country is treated as India.

    Names are compared case-insensitively after trimming, collapsing
    repeated whitespace and spelling ``&`` as ``and``, so inputs such as
    ``" Tamil  Nadu "`` or ``"Jammu & Kashmir"`` are accepted.

    Args:
        state: State or union-territory name.
        country: Country name; defaults to "India".

    Returns:
        True when the country is India and the state is in the list.
    """
    if not state:
        return False

    def normalise(name) -> str:
        """Lower-case, expand '&' and collapse whitespace."""
        return " ".join(str(name).replace('&', ' and ').lower().split())

    if country and normalise(country) != 'india':
        return False

    # Valid Indian states and union territories.
    valid_states = frozenset([
        'andhra pradesh', 'arunachal pradesh', 'assam', 'bihar', 'chhattisgarh',
        'goa', 'gujarat', 'haryana', 'himachal pradesh', 'jharkhand', 'karnataka',
        'kerala', 'madhya pradesh', 'maharashtra', 'manipur', 'meghalaya', 'mizoram',
        'nagaland', 'odisha', 'punjab', 'rajasthan', 'sikkim', 'tamil nadu',
        'telangana', 'tripura', 'uttar pradesh', 'uttarakhand', 'west bengal',
        'andaman and nicobar islands', 'chandigarh', 'dadra and nagar haveli and daman and diu',
        'delhi', 'jammu and kashmir', 'ladakh', 'lakshadweep', 'puducherry'
    ])
    return normalise(state) in valid_states
def verify_postal_code_in_city(postal_code: str, city: str) -> bool:
    """Verify via geocoding that *postal_code* belongs to *city*.

    Up to three query formats combining the postal code and city are
    geocoded. The check passes as soon as one result's address contains a
    spelling of the city (as-is, with spaces removed, or with spaces
    replaced by hyphens), compared case-insensitively.

    Changes from the previous version: bare ``except:`` clauses are
    replaced by logged ``except Exception`` handlers, inputs are
    stringified and stripped, and the one-second rate-limit pause is no
    longer skipped after a failed request.

    Args:
        postal_code: PIN code (string or number).
        city: City name.

    Returns:
        True when a geocoding result mentions the city. False otherwise,
        including on geocoding errors (logged, not raised).
    """
    if not postal_code or not city:
        return False

    try:
        postal_code = str(postal_code).strip()
        city = str(city).strip()
        if not postal_code or not city:
            return False

        city_lower = city.lower()
        city_variations = [
            city_lower,
            city_lower.replace(' ', ''),
            city_lower.replace(' ', '-')
        ]
        formats = [
            f"{postal_code}, {city}, India",
            f"{city}, {postal_code}, India",
            f"{postal_code}, {city}"
        ]

        for fmt in formats:
            try:
                location = geocoder.geocode(fmt, timeout=10)
                if location:
                    location_address = location.address.lower()
                    if any(var in location_address for var in city_variations):
                        return True
            except Exception as e:
                logger.debug(f"Error verifying postal code in city: {str(e)}")
            time.sleep(1)  # Rate limiting (applies after failures too)
        return False
    except Exception as e:
        logger.error(f"Error in postal code verification: {str(e)}")
        return False
def verify_coordinates_in_city(latitude: str, longitude: str, city: str) -> bool:
    """Verify that a coordinate pair lies near the geocoded centre of *city*.

    The city is geocoded, and the geodesic distance from the result to the
    supplied point is compared with a tier-based radius: 50 km for metro
    cities, 30 km for tier-2 cities and 20 km for all others. Tier
    membership is a substring test on the lower-cased city name.

    Changes from the previous version: the bare ``except:`` is split into
    a narrow handler for unparsable coordinates and a logged
    ``except Exception`` for geocoding/distance failures, so interrupts
    are no longer swallowed and failures are visible in the log.

    Args:
        latitude: Latitude in decimal degrees (string or number).
        longitude: Longitude in decimal degrees (string or number).
        city: City name.

    Returns:
        True when the point is within the allowed radius. False when any
        input is falsy, the coordinates do not parse, the city cannot be
        geocoded, or an error occurs.
    """
    if not all([latitude, longitude, city]):
        return False

    try:
        lat = float(str(latitude).strip())
        lng = float(str(longitude).strip())
    except (ValueError, TypeError):
        return False

    try:
        city_location = geocoder.geocode(f"{city}, India", timeout=10)
        if not city_location:
            return False

        city_coords = (city_location.latitude, city_location.longitude)
        property_coords = (lat, lng)
        distance = geodesic(city_coords, property_coords).kilometers

        city_lower = city.lower()
        metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
        tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                        "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                        "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
        if any(name in city_lower for name in metro_cities):
            max_distance = 50  # 50km for metro cities
        elif any(name in city_lower for name in tier2_cities):
            max_distance = 30  # 30km for tier 2 cities
        else:
            max_distance = 20  # 20km for other cities
        return distance <= max_distance
    except Exception as e:
        logger.debug(f"Error verifying coordinates in city: {str(e)}")
        return False
def _classify_landmarks(raw_landmarks: Any) -> Dict[str, Any]:
    """Summarise the landmarks field: whether given, how many, which categories."""
    landmark_types = {
        'transport': ['station', 'metro', 'bus', 'railway', 'airport', 'terminal', 'depot', 'stand', 'stop'],
        'education': ['school', 'college', 'university', 'institute', 'academy', 'campus', 'library'],
        'healthcare': ['hospital', 'clinic', 'medical', 'health', 'diagnostic', 'pharmacy', 'dispensary'],
        'shopping': ['mall', 'market', 'shop', 'store', 'bazaar', 'complex', 'plaza', 'retail', 'outlet'],
        'entertainment': ['park', 'garden', 'theater', 'cinema', 'stadium', 'auditorium', 'playground'],
        'business': ['office', 'business', 'corporate', 'commercial', 'industrial', 'tech park', 'hub']
    }

    # Accept either a comma-separated string or an already-split collection.
    if isinstance(raw_landmarks, str):
        pieces = raw_landmarks.split(',')
    elif isinstance(raw_landmarks, (list, tuple, set)):
        pieces = [str(item) for item in raw_landmarks]
    else:
        pieces = []

    # Empty segments (e.g. from a trailing comma) are not landmarks.
    landmarks = [piece.strip().lower() for piece in pieces if piece.strip()]

    found_types = []
    for landmark in landmarks:
        for type_name, keywords in landmark_types.items():
            if type_name not in found_types and any(keyword in landmark for keyword in keywords):
                found_types.append(type_name)

    return {
        'provided': bool(raw_landmarks),
        'count': len(landmarks),
        'types': found_types
    }


def _city_tier_for(city: Any) -> str:
    """Map a city name to 'metro', 'tier2', 'tier3', or 'unknown' when empty."""
    if not city:
        return "unknown"
    city_lower = str(city).lower()
    metro_cities = ["mumbai", "delhi", "bangalore", "hyderabad", "chennai", "kolkata", "pune"]
    tier2_cities = ["ahmedabad", "jaipur", "surat", "lucknow", "kanpur", "nagpur", "indore",
                    "thane", "bhopal", "visakhapatnam", "patna", "vadodara", "ghaziabad",
                    "ludhiana", "agra", "nashik", "faridabad", "meerut", "rajkot", "varanasi"]
    if any(name in city_lower for name in metro_cities):
        return "metro"
    if any(name in city_lower for name in tier2_cities):
        return "tier2"
    return "tier3"


def analyze_location(data: Dict[str, Any]) -> Dict[str, Any]:
    """Run every location check on *data* and summarise the outcome.

    Expected keys (all optional): ``address``, ``city``, ``state``,
    ``zip``, ``latitude``, ``longitude``, ``nearby_landmarks``. Missing or
    ``None`` values are treated as empty strings.

    The result contains one boolean per check, a weighted
    ``completeness_score`` (0-100), an ``assessment`` ("complete" >= 80,
    "partial" >= 50, else "minimal"), ``location_quality`` /
    ``verification_status`` ("verified" when all critical checks and at
    least two secondary checks pass), ``city_tier``,
    ``landmarks_analysis`` and ``formatted_address``.

    Changes from the previous version:
      * The caller's dict is no longer mutated; defaults are filled into
        a shallow copy.
      * ``None`` values no longer show up as the text "None" in
        ``formatted_address``.
      * ``nearby_landmarks`` may be a list/tuple/set as well as a
        comma-separated string, and empty segments are not counted.
      * A non-string city no longer sends the whole analysis to the error
        result during tier detection.

    Args:
        data: Mapping of location fields; a non-dict is treated as empty.

    Returns:
        The result dictionary described above. On an unexpected error the
        same keys are returned with 'error' statuses and all checks False.
    """
    try:
        # Defensive: ensure data is a dict
        if not isinstance(data, dict):
            logger.warning(f"Input to analyze_location is not a dict: {type(data)}")
            data = {}

        # Work on a shallow copy so the caller's dict is left untouched.
        data = dict(data)
        for key in ['address', 'city', 'state', 'zip', 'latitude', 'longitude', 'nearby_landmarks']:
            if data.get(key) is None:
                data[key] = ''

        verification_results = {
            'address_format_valid': validate_address_format(data['address']),
            'address_in_city': verify_location_in_city(data['address'], data['city']),
            'city_in_state': verify_city_in_state(data['city'], data['state']),
            'state_in_country': verify_state_in_country(data['state']),
            'postal_code_valid': validate_postal_code(data['zip']),
            'postal_code_in_city': verify_postal_code_in_city(data['zip'], data['city']),
            'coordinates_valid': validate_coordinates(data['latitude'], data['longitude']),
            'coordinates_in_city': verify_coordinates_in_city(
                data['latitude'],
                data['longitude'],
                data['city']
            )
        }

        # Weighted completeness score; weights sum to 1.0.
        weights = {
            'address_format_valid': 0.15,
            'address_in_city': 0.20,
            'city_in_state': 0.10,
            'state_in_country': 0.10,
            'postal_code_valid': 0.10,
            'postal_code_in_city': 0.10,
            'coordinates_valid': 0.10,
            'coordinates_in_city': 0.15
        }
        completeness_score = sum(
            weights[key] * 100
            for key, passed in verification_results.items()
            if passed
        )

        # Verified = all critical checks pass and at least 2 secondary checks pass.
        critical_checks = ['address_format_valid', 'city_in_state', 'state_in_country', 'postal_code_valid']
        secondary_checks = ['address_in_city', 'postal_code_in_city', 'coordinates_valid', 'coordinates_in_city']
        critical_passed = all(verification_results[check] for check in critical_checks)
        secondary_passed = sum(1 for check in secondary_checks if verification_results[check])
        location_quality = "verified" if critical_passed and secondary_passed >= 2 else "unverified"

        if completeness_score >= 80:
            assessment = "complete"
        elif completeness_score >= 50:
            assessment = "partial"
        else:
            assessment = "minimal"

        return {
            **verification_results,
            'assessment': assessment,
            'completeness_score': completeness_score,
            'location_quality': location_quality,
            'city_tier': _city_tier_for(data['city']),
            'landmarks_analysis': _classify_landmarks(data['nearby_landmarks']),
            'verification_status': "verified" if location_quality == "verified" else "unverified",
            'formatted_address': f"{data['address']}, {data['city']}, {data['state']}, India - {data['zip']}"
        }
    except Exception as e:
        logger.error(f"Error analyzing location: {str(e)}")
        return {
            'assessment': 'error',
            'completeness_score': 0,
            'location_quality': 'error',
            'city_tier': 'unknown',
            'landmarks_analysis': {'provided': False, 'count': 0, 'types': []},
            'verification_status': 'error',
            'formatted_address': '',
            'address_format_valid': False,
            'address_in_city': False,
            'city_in_state': False,
            'state_in_country': False,
            'postal_code_valid': False,
            'postal_code_in_city': False,
            'coordinates_valid': False,
            'coordinates_in_city': False
        }
def calculate_location_completeness(data):
    """Return a 0-100 score for how many location fields are filled in.

    Each field carries a fixed percentage weight (summing to 100), and a
    field counts when its value is truthy.

    Changes from the previous version:
      * Missing keys no longer raise KeyError; an absent field simply
        contributes nothing. The same applies when *data* is ``None`` or
        otherwise not subscriptable.
      * Weights are integer percentages, so the total is exact rather
        than a float sum truncated by ``int()``.

    Args:
        data: Mapping with any of the keys ``address``, ``city``,
            ``state``, ``country``, ``zip``, ``latitude``, ``longitude``,
            ``nearby_landmarks``.

    Returns:
        Integer score between 0 and 100.
    """
    # Percentage weight of each field.
    weights = {
        'address': 25,
        'city': 20,
        'state': 15,
        'country': 5,
        'zip': 10,
        'latitude': 10,
        'longitude': 10,
        'nearby_landmarks': 5
    }

    score = 0
    for field, weight in weights.items():
        try:
            value = data[field]
        except (KeyError, IndexError, TypeError):
            continue  # field absent (or data not subscriptable): no credit
        if value:
            score += weight
    return score