import gradio as gr from neo4j import GraphDatabase import logging from typing import List, Dict, Tuple import pandas as pd from datetime import datetime import os # Set up logging with more detailed format for debugging logging.basicConfig( level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S' ) logger = logging.getLogger(__name__) # Get database credentials from environment variables NEO4J_URL = os.getenv("NEO4J_URL") NEO4J_USER = os.getenv("NEO4J_USER") NEO4J_PASSWORD = os.getenv("NEO4J_PASSWORD") def format_neo4j_datetime(dt) -> str: """Convert Neo4j datetime to string format.""" if dt is None: logger.info("Received None datetime") return 'Unknown date' try: logger.info(f"Formatting datetime: {dt} of type {type(dt)}") if hasattr(dt, 'to_native'): dt = dt.to_native() logger.info(f"Converted to native: {dt} of type {type(dt)}") return dt.strftime('%Y-%m-%d') except Exception as e: logger.warning(f"Error formatting datetime: {e}") return 'Unknown date' def is_displayable_keyword(keyword: str) -> bool: """ Check if a keyword should be displayed (not just numbers and separators). Filters out: - Pure numbers (1234) - Numbers with dots (15.10) - Numbers with dashes (2023-01) - Numbers with spaces (15 10) - Numbers with slashes (15/10) - Any combination of above """ if not keyword: return False # Remove all common number separators and spaces cleaned = keyword.replace('.', '') \ .replace('-', '') \ .replace('/', '') \ .replace('\\', '') \ .replace(' ', '') \ .replace(':', '') \ .replace(',', '') # Check if what remains is just digits return not cleaned.isdigit() def format_interest_list_for_display(interests: set, max_items: int = 10) -> str: """Format a list of interests for display, hiding numeric-only keywords.""" if not interests: return 'None' # Filter numeric keywords only for display displayable_interests = {interest for interest in interests if is_displayable_keyword(interest)} if not displayable_interests: return 'None' sorted_interests = sorted(displayable_interests) if len(sorted_interests) <= max_items: return ', '.join(sorted_interests) return f"{', '.join(sorted_interests[:max_items])} (+{len(sorted_interests) - max_items} more)" class QuestionRecommender: def __init__(self): try: self.driver = GraphDatabase.driver( NEO4J_URL, auth=(NEO4J_USER, NEO4J_PASSWORD) ) logger.info("Initializing QuestionRecommender with debug database") # Test connection immediately self.driver.verify_connectivity() logger.info("Successfully connected to Neo4j database") self.verify_connection() # Inspect question types on initialization self.inspect_question_types() except Exception as e: logger.error(f"Failed to initialize database connection: {str(e)}") raise def verify_connection(self): """Verify database connection and log basic statistics.""" try: with self.driver.session() as session: # First try a simple query to verify connection test_result = session.run("MATCH (n) RETURN count(n) as count").single() if not test_result: raise Exception("Could not execute test query") logger.info(f"Database contains {test_result['count']} total nodes") # Get database statistics with relationship counts stats = session.run(""" // Count nodes MATCH (u:User) WITH COUNT(u) as user_count MATCH (k:Keyword) WITH user_count, COUNT(k) as keyword_count MATCH (q:Question) WITH user_count, keyword_count, COUNT(q) as question_count MATCH (t:Topic) WITH user_count, keyword_count, question_count, COUNT(t) as topic_count // Count relationships OPTIONAL MATCH ()-[r:INTERESTED_IN_KEYWORD]->() WITH user_count, keyword_count, question_count, topic_count, COUNT(r) as keyword_rel_count OPTIONAL MATCH ()-[r:INTERESTED_IN_TOPIC]->() WITH user_count, keyword_count, question_count, topic_count, keyword_rel_count, COUNT(r) as topic_rel_count OPTIONAL MATCH ()-[r:HAS_KEYWORD]->() WITH user_count, keyword_count, question_count, topic_count, keyword_rel_count, topic_rel_count, COUNT(r) as question_keyword_count OPTIONAL MATCH ()-[r:HAS_TOPIC]->() RETURN user_count, keyword_count, question_count, topic_count, keyword_rel_count, topic_rel_count, question_keyword_count, COUNT(r) as question_topic_count """).single() if not stats: raise Exception("Could not retrieve database statistics") logger.info("=== Database Statistics ===") logger.info(f"Nodes:") logger.info(f" Users: {stats['user_count']}") logger.info(f" Keywords: {stats['keyword_count']}") logger.info(f" Questions: {stats['question_count']}") logger.info(f" Topics: {stats['topic_count']}") logger.info(f"\nRelationships:") logger.info(f" User->Keyword (INTERESTED_IN_KEYWORD): {stats['keyword_rel_count']}") logger.info(f" User->Topic (INTERESTED_IN_TOPIC): {stats['topic_rel_count']}") logger.info(f" Question->Keyword (HAS_KEYWORD): {stats['question_keyword_count']}") logger.info(f" Question->Topic (HAS_TOPIC): {stats['question_topic_count']}") except Exception as e: logger.error(f"Database verification failed: {str(e)}") logger.error(f"URL: {NEO4J_URL}") logger.error(f"User: {NEO4J_USER}") raise Exception(f"Failed to verify database connection: {str(e)}") def inspect_question_types(self): """Inspect different types of questions and their attributes in the database.""" with self.driver.session() as session: try: # Get all distinct question types and their properties result = session.run(""" MATCH (q:Question) WITH DISTINCT keys(q) as props, labels(q) as types RETURN types, props, count(*) as count ORDER BY count DESC """) logger.info("\n=== Question Types and Properties ===") for record in result: types = record["types"] props = record["props"] count = record["count"] logger.info(f"\nType: {types}") logger.info(f"Count: {count}") logger.info("Properties:") for prop in props: # Get a sample value for this property sample = session.run(""" MATCH (q:Question) WHERE $prop in keys(q) RETURN q[$prop] as value LIMIT 1 """, prop=prop).single() value = sample["value"] if sample else None value_type = type(value).__name__ if value is not None else "None" logger.info(f" - {prop}: {value_type} (example: {str(value)[:100]}{'...' if str(value)[100:] else ''})") # Get relationships specific to different question types result = session.run(""" MATCH (q:Question)-[r]->(target) WITH DISTINCT type(r) as rel_type, labels(target) as target_labels, count(*) as count RETURN rel_type, target_labels, count ORDER BY count DESC """) logger.info("\n=== Question Relationships ===") for record in result: rel_type = record["rel_type"] target_labels = record["target_labels"] count = record["count"] logger.info(f"Relationship: {rel_type} -> {target_labels} (Count: {count})") except Exception as e: logger.error(f"Error inspecting question types: {str(e)}") raise def close(self): self.driver.close() def get_all_users(self) -> List[str]: """Get list of all users with interest counts.""" with self.driver.session() as session: try: # Get users with their interest counts using proper relationship patterns result = session.run(""" MATCH (u:User) OPTIONAL MATCH (u)-[r:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]->(interest) WITH u, COUNT(DISTINCT CASE WHEN type(r) = 'INTERESTED_IN_KEYWORD' THEN interest END) as keyword_count, COUNT(DISTINCT CASE WHEN type(r) = 'INTERESTED_IN_TOPIC' THEN interest END) as topic_count WHERE keyword_count > 0 OR topic_count > 0 RETURN u.name as username, keyword_count, topic_count, keyword_count + topic_count as total_interests ORDER BY total_interests DESC, username """) users_with_counts = [( record["username"], record["keyword_count"], record["topic_count"] ) for record in result if record["username"]] if not users_with_counts: logger.warning("No users found with interests") return [] logger.info(f"Retrieved {len(users_with_counts)} users with interests") logger.info("Top 5 users by interest count:") for username, kw_count, topic_count in users_with_counts[:5]: logger.info(f" - {username}: {kw_count} keywords, {topic_count} topics") # Format usernames with their counts return [ f"{username} ({kw_count} keywords, {topic_count} topics)" for username, kw_count, topic_count in users_with_counts ] except Exception as e: logger.error(f"Error fetching users: {str(e)}") return [] def get_user_interests(self, username: str) -> Dict[str, set]: """Get keywords and topics a user is interested in.""" with self.driver.session() as session: # Get keywords the user is interested in keyword_result = session.run(""" MATCH (u:User {name: $username})-[:INTERESTED_IN_KEYWORD]->(k:Keyword) RETURN DISTINCT k.keyword as keyword """, username=username) keywords = {str(record["keyword"]) for record in keyword_result if record["keyword"]} # Log keyword count for debugging logger.debug(f"Found {len(keywords)} keywords for user {username}") # Get topics the user is interested in topic_result = session.run(""" MATCH (u:User {name: $username})-[:INTERESTED_IN_TOPIC]->(t:Topic) RETURN DISTINCT t.topic as topic """, username=username) topics = {str(record["topic"]) for record in topic_result if record["topic"]} # Log topic count for debugging logger.debug(f"Found {len(topics)} topics for user {username}") return {"keywords": keywords or set(), "topics": topics or set()} def find_common_questions(self, user1: str, user2: str, max_questions: int = 5) -> List[Dict]: """Find questions to recommend based on common interests using advanced Neo4j features.""" with self.driver.session() as session: # Debug: Check if users exist and have interests user_check = session.run(""" MATCH (u1:User {name: $user1}) MATCH (u2:User {name: $user2}) OPTIONAL MATCH (u1)-[r1:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]->(interest1) OPTIONAL MATCH (u2)-[r2:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]->(interest2) RETURN COUNT(DISTINCT u1) as user1_exists, COUNT(DISTINCT u2) as user2_exists, COUNT(DISTINCT interest1) as user1_interests, COUNT(DISTINCT interest2) as user2_interests """, user1=user1, user2=user2).single() if not (user_check and user_check['user1_exists'] and user_check['user2_exists']): logger.error(f"One or both users not found: {user1}, {user2}") return [] logger.info(f"User {user1} has {user_check['user1_interests']} total interests") logger.info(f"User {user2} has {user_check['user2_interests']} total interests") # Advanced question recommendation query using Neo4j path finding and scoring questions_query = """ // Find all interests (both keywords and topics) for both users MATCH (u1:User {name: $user1}) MATCH (u2:User {name: $user2}) // Get all interests for both users OPTIONAL MATCH (u1)-[r1:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]->(interest1) OPTIONAL MATCH (u2)-[r2:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]->(interest2) WITH u1, u2, COLLECT(DISTINCT interest1) as u1_interests, COLLECT(DISTINCT interest2) as u2_interests // Find questions related to either user's interests for each source CALL { WITH u1, u2, u1_interests, u2_interests UNWIND u1_interests + u2_interests as interest MATCH (q:Question)-[r:HAS_KEYWORD|HAS_TOPIC]->(interest) WHERE q.author <> $user1 AND q.author <> $user2 AND q.source = 'stack_exchange' AND ( (interest IN u1_interests AND interest IN u2_interests) OR (interest IN u1_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u2))) OR (interest IN u2_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u1))) ) WITH q, interest, type(r) as rel_type, CASE WHEN interest IN u1_interests AND interest IN u2_interests THEN 2.0 ELSE 1.0 END as interest_weight WITH q, collect({interest: interest, weight: interest_weight, type: rel_type}) as interests, sum(interest_weight) as base_score RETURN q, interests, base_score ORDER BY base_score * rand() DESC LIMIT 15 // Increased from 10 to get more variety UNION WITH u1, u2, u1_interests, u2_interests UNWIND u1_interests + u2_interests as interest MATCH (q:Question)-[r:HAS_KEYWORD|HAS_TOPIC]->(interest) WHERE q.source = 'trivia' AND ( (interest IN u1_interests AND interest IN u2_interests) OR (interest IN u1_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u2))) OR (interest IN u2_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u1))) ) WITH q, interest, type(r) as rel_type, CASE WHEN interest IN u1_interests AND interest IN u2_interests THEN 2.0 ELSE 1.0 END as interest_weight WITH q, collect({interest: interest, weight: interest_weight, type: rel_type}) as interests, sum(interest_weight) as base_score RETURN q, interests, base_score ORDER BY base_score * rand() DESC LIMIT 15 // Increased from 10 to get more variety UNION WITH u1, u2, u1_interests, u2_interests UNWIND u1_interests + u2_interests as interest MATCH (q:Question)-[r:HAS_KEYWORD|HAS_TOPIC]->(interest) WHERE q.source = 'wikipedia' AND ( (interest IN u1_interests AND interest IN u2_interests) OR (interest IN u1_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u2))) OR (interest IN u2_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u1))) ) WITH q, interest, type(r) as rel_type, CASE WHEN interest IN u1_interests AND interest IN u2_interests THEN 2.0 ELSE 1.0 END as interest_weight WITH q, collect({interest: interest, weight: interest_weight, type: rel_type}) as interests, sum(interest_weight) as base_score RETURN q, interests, base_score ORDER BY base_score * rand() DESC LIMIT 15 // Increased from 10 to get more variety UNION WITH u1, u2, u1_interests, u2_interests UNWIND u1_interests + u2_interests as interest MATCH (q:Question)-[r:HAS_KEYWORD|HAS_TOPIC]->(interest) WHERE q.source = 'reddit' AND ( (interest IN u1_interests AND interest IN u2_interests) OR (interest IN u1_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u2))) OR (interest IN u2_interests AND EXISTS((q)-[:HAS_KEYWORD|HAS_TOPIC]->()<-[:INTERESTED_IN_KEYWORD|INTERESTED_IN_TOPIC]-(u1))) ) WITH q, interest, type(r) as rel_type, CASE WHEN interest IN u1_interests AND interest IN u2_interests THEN 2.0 ELSE 1.0 END as interest_weight WITH q, collect({interest: interest, weight: interest_weight, type: rel_type}) as interests, sum(interest_weight) as base_score RETURN q, interests, base_score ORDER BY base_score * rand() DESC LIMIT 15 // Increased from 10 to get more variety } // Calculate temporal relevance for the combined results WITH q, interests, base_score, CASE WHEN q.created_utc_ts IS NOT NULL THEN base_score * (1.0 + 0.1 * (1.0 - duration.between(q.created_utc_ts, datetime()).days / 365.0)) ELSE base_score END as temporal_score, // Add source-specific random boost to ensure better mixing CASE q.source WHEN 'stack_exchange' THEN rand() * 0.4 WHEN 'trivia' THEN rand() * 0.4 WHEN 'wikipedia' THEN rand() * 0.4 WHEN 'reddit' THEN rand() * 0.4 ELSE rand() * 0.4 END as source_random_boost // Return results with all metadata WITH q, interests, temporal_score, source_random_boost, temporal_score * (0.6 + 0.8 * rand()) + source_random_boost as final_score RETURN DISTINCT q.title as title, q.body as body, q.created_utc_ts as created_utc_ts, q.author as author, q.source as source, q.correct_answer as correct_answer, q.incorrect_answers as incorrect_answers, q.upvotes as upvotes, q.num_comments as num_comments, q.subreddit as subreddit, [i in interests | CASE WHEN i.type = 'HAS_KEYWORD' THEN i.interest.keyword ELSE i.interest.topic END] as matching_interests, [i in interests | CASE WHEN i.type = 'HAS_KEYWORD' THEN 'keyword' ELSE 'topic' END] as interest_types, final_score as relevance_score ORDER BY final_score DESC LIMIT $max_questions """ questions = [dict(record) for record in session.run(questions_query, user1=user1, user2=user2, max_questions=max_questions)] if questions: first_q = questions[0] logger.info(f"Sample question:") logger.info(f"Title: {first_q.get('title', 'No title')}") logger.info(f"Author: {first_q.get('author', 'No author')}") logger.info(f"Score: {first_q.get('relevance_score', 0)}") logger.info(f"Interests: {first_q.get('matching_interests', [])}") logger.info(f"Found {len(questions)} questions with common interests") return questions def process_body(text, title): """Process question body to handle images and HTML.""" if not text: logger.warning(f"Empty body for question: {title}") return "" try: from bs4 import BeautifulSoup # Parse the HTML content soup = BeautifulSoup(str(text), 'html.parser') # Function to fix Stack Exchange URLs def fix_stack_exchange_url(url): if not url: return url if url.startswith(('http://', 'https://')): return url if url.startswith('//'): return 'https:' + url if url.startswith('/'): return 'https://i.stack.imgur.com' + url return 'https://i.stack.imgur.com/' + url # Find all img tags and replace with preview cards for img in soup.find_all('img'): src = img.get('src', '') if not src: continue fixed_src = fix_stack_exchange_url(src) alt_text = img.get('alt', '').strip() if not alt_text or alt_text.lower() == 'enter image description here': alt_text = 'Question image' # Create an image preview card preview_html = f"""