Skip to content

Commit

Permalink
Merge pull request NOAA-GFDL#58 from aradhakrishnanGFDL/combine-cats
Browse files Browse the repository at this point in the history
Combine cats
  • Loading branch information
ceblanton authored Sep 25, 2024
2 parents 4b7ce63 + 9c5d2f7 commit 876ceff
Showing 1 changed file with 84 additions and 0 deletions.
84 changes: 84 additions & 0 deletions catalogbuilder/scripts/combine_cats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#!/usr/bin/env python

import pandas as pd
import json
from jsondiff import diff
import pathlib
import sys
import os, click

# Debug switch. When set to True, the sample catalog paths below are defined as
# module-level names. combine_cats itself receives its paths from the
# command-line options, so these values only matter for ad-hoc experiments.
test = False

if test:
    json1 = "/home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING.json"
    json2 = "/home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC.json"
    combined_json = "/home/a1r/github/noaa-gfdl/catalogs/combined_CM4.5v01_om5b06_piC_noBLING_and_ESM4.5v01_om5b04_piC.json"

@click.command()
@click.option('-i', '--inputfiles', required=True, multiple=True,
              help='Path to a json catalog file to be combined; repeat -i for each catalog (at least two)')
@click.option('-o', '--output_path', required=True, nargs=1, help='Specify the output json path')
def combine_cats(inputfiles, output_path):
    """Combine two or more json catalogs and their csv files into one catalog.

    Each input json is expected to have its csv catalog in the same directory,
    under the same file name with a .csv suffix. The csv files are concatenated
    into a single csv written next to the output json (same name, .csv suffix).
    The first input's specification is then written to the output json with
    its catalog_file entry pointing at the combined csv.

    The inputs may differ from the first one only in their catalog_file entry.
    Any other difference, a missing input file, or fewer than two inputs stops
    the script with an error message and a non-zero exit status.

    inputfiles: paths of the json catalogs, one per -i option.

    output_path: path of the combined json catalog to write.

    Example usage:
    combine_cats.py -i /home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING.json -i /home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC.json -o combinedcat.json
    """
    if len(inputfiles) < 2:
        sys.exit("cannot parse inputfiles: at least two catalogs are required, each passed with its own -i")

    # Assume each csv is in the same path as its json and deduce the filename.
    json_paths = [pathlib.Path(name) for name in inputfiles]
    cat_csvs = [path.with_suffix('.csv') for path in json_paths]
    for csv_path in cat_csvs:
        print(csv_path)

    # Fail early with a readable message instead of a traceback from open()/read_csv().
    missing = [os.fspath(path) for path in json_paths + cat_csvs if not path.is_file()]
    if missing:
        sys.exit("cannot find input file(s): " + ", ".join(missing))

    specs = []
    for path in json_paths:
        with open(path) as spec_file:
            specs.append(json.load(spec_file))

    ##### Check if the schema is the same: every catalog is compared to the first one
    catspec = specs[0]
    for path, spec in zip(json_paths[1:], specs[1:]):
        differ = diff(catspec, spec)
        if not differ:
            # Identical specifications are trivially compatible.
            continue
        print("INFO: Schema differs")
        print(differ)
        # Only a difference confined to catalog_file is acceptable; anything
        # else (extra keys, removed keys, changed attributes) is a schema change.
        if isinstance(differ, dict) and list(differ) == ["catalog_file"]:
            print("We can combine since the catalog_file is the only difference")
        else:
            sys.exit("Schema likely varies significantly, cannot combine: " + os.fspath(path))

    #### The schemas match, so append the data frames together and create the combined csv
    combined_csv = pathlib.Path(output_path).with_suffix('.csv')
    df_concat = pd.concat([pd.read_csv(csv_path) for csv_path in cat_csvs], ignore_index=True)
    df_concat.to_csv(combined_csv, index=False)

    # Write out the combined json: the first specification, re-pointed at the combined csv
    catspec['catalog_file'] = os.fspath(combined_csv)
    with open(output_path, 'w') as outfile:
        outfile.write(json.dumps(catspec, indent=4))

    # Print pointers
    print("Combined catalog specification- ", output_path)
    print("Combined csv/catalog- ", combined_csv)

def combine_cats_cli(**kwargs):
    """Forward all keyword arguments to combine_cats and return its result."""
    outcome = combine_cats(**kwargs)
    return outcome

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    combine_cats_cli()

0 comments on commit 876ceff

Please sign in to comment.