Spaces:
Runtime error
Runtime error
Nicky Nicolson
committed on
Commit
·
2cc6a74
1
Parent(s):
4d53d1c
Modified name parsing to use bionomia directly
Browse files
- Dockerfile +7 -7
- tab2csv.py +38 -4
Dockerfile
CHANGED
|
@@ -18,15 +18,15 @@ RUN unzip /data/gbif-occs.zip -d /data
|
|
| 18 |
RUN ls -l /data
|
| 19 |
COPY ./tab2csv.py /code/tab2csv.py
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
RUN \
|
| 24 |
-
apt-get update && \
|
| 25 |
-
apt-get install -y ruby
|
| 26 |
-
RUN gem install dwc_agent
|
| 27 |
|
| 28 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
| 29 |
-
RUN python tab2csv.py --
|
| 30 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
| 31 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
| 32 |
RUN ls -l /code
|
|
|
|
| 18 |
RUN ls -l /data
|
| 19 |
COPY ./tab2csv.py /code/tab2csv.py
|
| 20 |
|
| 21 |
+
## Setup to parse collector names using Bionomia utils (reqs Ruby)
|
| 22 |
+
## Install ruby
|
| 23 |
+
#RUN \
|
| 24 |
+
# apt-get update && \
|
| 25 |
+
# apt-get install -y ruby
|
| 26 |
+
#RUN gem install dwc_agent
|
| 27 |
|
| 28 |
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
| 29 |
+
RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
|
| 30 |
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
| 31 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
| 32 |
RUN ls -l /code
|
tab2csv.py
CHANGED
|
@@ -1,18 +1,51 @@
|
|
| 1 |
import argparse
|
| 2 |
import pandas as pd
|
| 3 |
-
import
|
| 4 |
from tqdm import tqdm
|
| 5 |
tqdm.pandas()
|
| 6 |
|
| 7 |
-
def getFirstFamilyName(
|
| 8 |
firstFamilyName = None
|
| 9 |
-
parsed = bananompy.parse(
|
| 10 |
try:
|
| 11 |
firstFamilyName = parsed[0]['parsed'][0]['family']
|
| 12 |
except:
|
| 13 |
pass
|
| 14 |
return firstFamilyName
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
if __name__ == '__main__':
|
| 17 |
parser = argparse.ArgumentParser()
|
| 18 |
parser.add_argument("inputfile")
|
|
@@ -30,7 +63,8 @@ if __name__ == '__main__':
|
|
| 30 |
if args.createcols:
|
| 31 |
# Extract unique recordedBy values
|
| 32 |
df_rb = df[['recordedBy']].drop_duplicates()
|
| 33 |
-
df_rb
|
|
|
|
| 34 |
# Apply back to main dataframe
|
| 35 |
df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
|
| 36 |
# Add column holding collector name and number
|
|
|
|
| 1 |
import argparse
|
| 2 |
import pandas as pd
|
| 3 |
+
import requests
|
| 4 |
from tqdm import tqdm
|
| 5 |
tqdm.pandas()
|
| 6 |
|
| 7 |
+
def getFirstFamilyName(recordedBy):
|
| 8 |
firstFamilyName = None
|
| 9 |
+
parsed = bananompy.parse(recordedBy)
|
| 10 |
try:
|
| 11 |
firstFamilyName = parsed[0]['parsed'][0]['family']
|
| 12 |
except:
|
| 13 |
pass
|
| 14 |
return firstFamilyName
|
| 15 |
|
| 16 |
+
def getFirstFamilyNames(recordedBy_l):
|
| 17 |
+
# post to bionomia
|
| 18 |
+
bionomia_parse_endpoint_url = "https://api.bionomia.net/parse.json"
|
| 19 |
+
data = dict()
|
| 20 |
+
data['names'] = '\r\n'.join(recordedBy_l)
|
| 21 |
+
r = requests.post(bionomia_parse_endpoint_url, data=data)
|
| 22 |
+
parsed_results = r.json()
|
| 23 |
+
results = dict()
|
| 24 |
+
for parsed_result in parsed_results:
|
| 25 |
+
try:
|
| 26 |
+
results[parsed_result['original']] = parsed_result['parsed'][0]['family']
|
| 27 |
+
except:
|
| 28 |
+
results[parsed_result['original']] = None
|
| 29 |
+
return results
|
| 30 |
+
|
| 31 |
+
def getFirstFamilyNameBulk(df,
|
| 32 |
+
recordedByColName="recordedBy",
|
| 33 |
+
firstFamilyNameColName="recordedBy_first_familyname",
|
| 34 |
+
batchsize=500):
|
| 35 |
+
results = dict()
|
| 36 |
+
recordedBy_l = []
|
| 37 |
+
for s in tqdm(df[recordedByColName].values):
|
| 38 |
+
if len(recordedBy_l) == batchsize:
|
| 39 |
+
# send it
|
| 40 |
+
results.update(getFirstFamilyNames(recordedBy_l))
|
| 41 |
+
# clear for next iteration
|
| 42 |
+
recordedBy_l = []
|
| 43 |
+
recordedBy_l.append(s)
|
| 44 |
+
if len(recordedBy_l) > 0:
|
| 45 |
+
results.update(getFirstFamilyNames(recordedBy_l))
|
| 46 |
+
df[firstFamilyNameColName] = df[recordedByColName].map(results)
|
| 47 |
+
return df
|
| 48 |
+
|
| 49 |
if __name__ == '__main__':
|
| 50 |
parser = argparse.ArgumentParser()
|
| 51 |
parser.add_argument("inputfile")
|
|
|
|
| 63 |
if args.createcols:
|
| 64 |
# Extract unique recordedBy values
|
| 65 |
df_rb = df[['recordedBy']].drop_duplicates()
|
| 66 |
+
df_rb = getFirstFamilyNameBulk(df_rb)
|
| 67 |
+
#df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
|
| 68 |
# Apply back to main dataframe
|
| 69 |
df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
|
| 70 |
# Add column holding collector name and number
|