import pandas as pd
from datetime import datetime

from appStore.prep_utils import create_chunks
from appStore.search import hybrid_search

path_to_data = "./docStore/"

def process_iati():
    """
    this will read the iati files and create the chunks
    """
    orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
    region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
    sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
    status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
    texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")

    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)

    projects_df.drop(columns=['orga_abbreviation', 'client', 'orga_full_name',
                              'country', 'country_flag', 'crs_5_code', 'crs_3_code',
                              'country_code_list', 'sgd_pred_code', 'crs_5_name',
                              'crs_3_name', 'sgd_pred_str'], inplace=True)
    #print(projects_df.columns)
    projects_df['text_size'] = projects_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
    projects_df['chunks'] = projects_df.apply(lambda x: create_chunks(x['title_main'] + x['description_main']), axis=1)
    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)
    projects_df['source'] = 'IATI'
    projects_df.rename(columns = {'iati_id':'id','iati_orga_id':'org'}, inplace=True)

    return projects_df
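
# Usage sketch (not executed on import): assumes the IATI CSVs above exist under
# ./docStore/iati_files/.
#   df = process_iati()
#   df[['id', 'org', 'chunks', 'text_size', 'source']].head()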


def process_giz_worldwide():
    """
    This will read the new giz_worldwide file and create the chunks.
    The following adjustments have been made:
      - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
      - Renames 'name.en' to 'project_name'
      - Uses the 'merged_text' column for creating chunks and computing text size
      - Creates an empty 'url' column (since the new dataset has an empty URL)
      - Renames 'country' to 'countries'
      - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
    """
    # Read the new JSON file
    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
    # Sample random rows for quick embeddings (seed set for reproducibility)
    giz_df = giz_df.sample(n=5, random_state=42)  
    # Rename columns per new dataset requirements
    giz_df = giz_df.rename(columns={
        'name.en': 'project_name',
        'country': 'countries',
        'duration.project.start': 'start_year',
        'duration.project.end': 'end_year'
    })
    
    # Create an empty 'url' column as the new dataset has an empty URL
    giz_df['url'] = ''
    
    # Create text_size based on merged_text and create chunks from merged_text
    giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
    giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
    
    print("initial df length:", len(giz_df))
    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
    print("new df length:", len(giz_df))
    print(giz_df.columns)
    
    giz_df['source'] = 'GIZ_WORLDWIDE'
    return giz_df
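
# Usage sketch (not executed on import): assumes the GIZ JSON export above exists
# and exposes the 'merged_text' and 'name.en' fields used here.
#   giz = process_giz_worldwide()
#   giz[['project_name', 'countries', 'start_year', 'end_year', 'chunks']].head()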


# def process_giz_worldwide():
#     """
#     this will read the giz_worldwide files and create the chunks
#     """
#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
#     giz_df = giz_df.rename(columns={'content':'project_description'})
#     # Sample random rows for quick embeddings (seed set for reproducibility)
#     giz_df = giz_df.sample(n=5, random_state=42)    
#     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
#     giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
#     print("initial df length:",len(giz_df))
#     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
#     print("new df length:",len(giz_df))
#     print(giz_df.columns)
#     #giz_df.drop(columns = ['filename', 'url', 'name', 'mail', 
#     #                    'language', 'start_year', 'end_year','poli_trager'], inplace=True)
#     giz_df['source'] = 'GIZ_WORLDWIDE'
#     return giz_df

def remove_duplicates(results_list):
    """
    Return a new list of results with duplicates removed, 
    based on 'url' in metadata.
    """
    unique_results = []
    seen_urls = set()

    for r in results_list:
        # Safely get the URL from metadata (missing keys fall back to None)
        url = r.payload.get('metadata', {}).get('url')
        if url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(r)

    return unique_results
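
# Usage sketch: remove_duplicates expects Qdrant-style results whose payload carries
# a 'metadata' dict with a 'url' key, e.g. (illustrative stand-in, not a real response):
#   class _Hit:  # minimal stand-in for a search hit
#       payload = {'metadata': {'url': 'https://example.org/project-1'}}
#   remove_duplicates([_Hit(), _Hit()])  # -> one result; the repeated URL is dropped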

def extract_year(date_str):
    """Return the year of a 'YYYY-MM-DD' date string, or 'Unknown' if it cannot be parsed."""
    try:
        return str(datetime.strptime(date_str, "%Y-%m-%d").year)
    except Exception:
        return "Unknown"


def get_max_end_year(_client, collection_name):
    """
    Return the maximum 'end_year' in the entire collection 
    so we can set the slider's max_value dynamically.
    """
    # For safety, get a large pool of items
    all_res = hybrid_search(_client, "", collection_name, limit=2000)
    big_list = all_res[0] + all_res[1]

    years = []
    for r in big_list:
        metadata = r.payload.get('metadata', {})
        year_str = metadata.get('end_year', None)
        if year_str:
            try:
                years.append(float(year_str))
            except ValueError:
                pass

    if not years:
        # fallback if no valid end years found
        return 2030
    return int(max(years))
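

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the app flow. It assumes the docStore
    # data files referenced above exist locally; the Qdrant client setup and the
    # collection name below are illustrative assumptions only.
    iati_df = process_iati()
    giz_df = process_giz_worldwide()
    print("IATI chunks:", len(iati_df), "| GIZ chunks:", len(giz_df))

    # from qdrant_client import QdrantClient   # hypothetical client for get_max_end_year
    # client = QdrantClient(path=f"{path_to_data}qdrant")
    # print("max end_year:", get_max_end_year(client, "giz_worldwide"))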