from datetime import datetime

import pandas as pd

from appStore.prep_utils import create_chunks
from appStore.search import hybrid_search

path_to_data = "./docStore/"
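# NOTE (assumptions, inferred from how the helpers are used below):
#   create_chunks(text: str) -> list[str]   # splits a long text into chunk strings that can be exploded row-wise
#   hybrid_search(client, query, collection_name, limit=...) -> two result lists (e.g. semantic and lexical hits),
#       where each hit carries a Qdrant-style `payload` containing a 'metadata' dict.
# Adjust these expectations if the actual appStore implementations differ.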
def process_iati():
    """
    Read the IATI CSV exports, merge them on 'iati_id', and create text chunks.
    """
    orgas_df = pd.read_csv(f"{path_to_data}iati_files/project_orgas.csv")
    region_df = pd.read_csv(f"{path_to_data}iati_files/project_region.csv")
    sector_df = pd.read_csv(f"{path_to_data}iati_files/project_sector.csv")
    status_df = pd.read_csv(f"{path_to_data}iati_files/project_status.csv")
    texts_df = pd.read_csv(f"{path_to_data}iati_files/project_texts.csv")

    projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
    projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')

    # Keep only BMZ projects and drop columns that are not needed downstream
    projects_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
    projects_df.drop(columns=['orga_abbreviation', 'client',
                              'orga_full_name', 'country',
                              'country_flag', 'crs_5_code', 'crs_3_code', 'country_code_list',
                              'sgd_pred_code', 'crs_5_name', 'crs_3_name', 'sgd_pred_str'], inplace=True)

    # Compute text size and chunk the concatenated title + description, one row per chunk
    projects_df['text_size'] = projects_df.apply(
        lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
    projects_df['chunks'] = projects_df.apply(
        lambda x: create_chunks(x['title_main'] + x['description_main']), axis=1)
    projects_df = projects_df.explode(column=['chunks'], ignore_index=True)

    projects_df['source'] = 'IATI'
    projects_df.rename(columns={'iati_id': 'id', 'iati_orga_id': 'org'}, inplace=True)
    return projects_df
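# Usage (sketch, assuming the five CSV exports above exist under ./docStore/iati_files/):
#   iati_chunks = process_iati()
#   iati_chunks[['id', 'org', 'chunks', 'source']].head()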
def process_giz_worldwide():
    """
    This will read the new giz_worldwide file and create the chunks.
    The following adjustments have been made:
      - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
      - Renames 'name.en' to 'project_name'
      - Uses the 'merged_text' column for creating chunks and computing text size
      - Creates an empty 'url' column (since the new dataset has an empty URL)
      - Renames 'country' to 'countries'
      - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
    """
    # Read the new JSON file
    giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')

    # Sample random rows for quick embeddings (seed set for reproducibility)
    giz_df = giz_df.sample(n=5, random_state=42)

    # Rename columns per new dataset requirements
    giz_df = giz_df.rename(columns={
        'name.en': 'project_name',
        'country': 'countries',
        'duration.project.start': 'start_year',
        'duration.project.end': 'end_year'
    })

    # Create an empty 'url' column as the new dataset has an empty URL
    giz_df['url'] = ''

    # Create text_size based on merged_text and create chunks from merged_text
    giz_df['text_size'] = giz_df['merged_text'].apply(
        lambda text: len(text.split()) if isinstance(text, str) else 0)
    giz_df['chunks'] = giz_df['merged_text'].apply(
        lambda text: create_chunks(text) if isinstance(text, str) else [])

    print("initial df length:", len(giz_df))
    giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
    print("new df length:", len(giz_df))
    print(giz_df.columns)

    giz_df['source'] = 'GIZ_WORLDWIDE'
    return giz_df
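# Usage (sketch): only the 5 sampled projects are chunked above; remove the
# .sample(n=5, random_state=42) call to process the full GIZ dataset.
#   giz_chunks = process_giz_worldwide()
#   giz_chunks[['project_name', 'countries', 'start_year', 'end_year', 'url', 'chunks']].head()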
# Previous version of process_giz_worldwide(), kept for reference:
#
# def process_giz_worldwide():
#     """
#     this will read the giz_worldwide files and create the chunks
#     """
#     giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
#     giz_df = giz_df.rename(columns={'content': 'project_description'})
#     # Sample random rows for quick embeddings (seed set for reproducibility)
#     giz_df = giz_df.sample(n=5, random_state=42)
#     giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
#     giz_df['chunks'] = giz_df.apply(lambda x: create_chunks(x['project_name'] + x['project_description']), axis=1)
#     print("initial df length:", len(giz_df))
#     giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
#     print("new df length:", len(giz_df))
#     print(giz_df.columns)
#     # giz_df.drop(columns=['filename', 'url', 'name', 'mail',
#     #                      'language', 'start_year', 'end_year', 'poli_trager'], inplace=True)
#     giz_df['source'] = 'GIZ_WORLDWIDE'
#     return giz_df
def remove_duplicates(results_list):
    """
    Return a new list of results with duplicates removed,
    based on 'url' in metadata.
    """
    unique_results = []
    seen_urls = set()
    for r in results_list:
        # Safely get the URL from metadata (missing keys fall back to None)
        url = r.payload.get('metadata', {}).get('url')
        if url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(r)
    return unique_results
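# Usage (sketch): de-duplicate one list of hybrid-search hits by URL, assuming
# `client` and `collection_name` are set up elsewhere in the app.
#   semantic_hits, lexical_hits = hybrid_search(client, "climate adaptation", collection_name, limit=20)
#   semantic_hits = remove_duplicates(semantic_hits)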
def extract_year(date_str):
    """
    Extract the year from an ISO 'YYYY-MM-DD' date string; return "Unknown" on failure.
    """
    try:
        return str(datetime.strptime(date_str, "%Y-%m-%d").year)
    except Exception:
        return "Unknown"
def get_max_end_year(_client, collection_name):
    """
    Return the maximum 'end_year' in the entire collection
    so we can set the slider's max_value dynamically.
    """
    # For safety, get a large pool of items
    all_res = hybrid_search(_client, "", collection_name, limit=2000)
    big_list = all_res[0] + all_res[1]
    years = []
    for r in big_list:
        metadata = r.payload.get('metadata', {})
        year_str = metadata.get('end_year', None)
        if year_str:
            try:
                years.append(float(year_str))
            except ValueError:
                pass
    if not years:
        # Fallback if no valid end years were found
        return 2030
    return int(max(years))
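

if __name__ == "__main__":
    # Minimal end-to-end sketch of the preprocessing helpers. It assumes the IATI CSVs
    # and the GIZ JSON export are present under ./docStore/ at the paths referenced above.
    iati_chunks = process_iati()
    giz_chunks = process_giz_worldwide()
    print(f"IATI chunks: {len(iati_chunks)}, GIZ chunks: {len(giz_chunks)}")

    # Year extraction works on ISO date strings and degrades gracefully otherwise.
    print(extract_year("2024-12-31"))  # -> "2024"
    print(extract_year("not a date"))  # -> "Unknown"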