forked from NOAA-GFDL/CatalogBuilder
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request NOAA-GFDL#58 from aradhakrishnanGFDL/combine-cats
Combine cats
- Loading branch information
Showing
1 changed file
with
84 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
#!/usr/bin/env python | ||
|
||
import pandas as pd | ||
import json | ||
from jsondiff import diff | ||
import pathlib | ||
import sys | ||
import os, click | ||
|
||
# Developer switch. It is off, so the sample catalog paths below are not
# defined at import time; the command line options supply the real paths.
test = False

if test:
    json1 = "/home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING.json"
    json2 = "/home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC.json"
    combined_json = "/home/a1r/github/noaa-gfdl/catalogs/combined_CM4.5v01_om5b06_piC_noBLING_and_ESM4.5v01_om5b04_piC.json"
|
||
@click.command()
@click.option('-i','--inputfiles',required=True,multiple=True,help='Pass json catalog files to-be-combined, space separated')
@click.option('-o','--output_path',required=True,nargs=1,help='Specify the output json path')
def combine_cats(inputfiles, output_path):
    """Combine two or more json catalogs that share a schema.

    The csv of every input catalog is assumed to sit next to its json file,
    with the same file name and a .csv suffix. The combined csv is written
    next to the output json, named the same way.

    Pass one -i per input catalog:

    combine_cats.py -i jsoncatalog -i jsoncatalog2 -o outputjson

    Example usage:

    combine_cats.py -i /home/a1r/github/noaa-gfdl/catalogs/CM4.5v01_om5b06_piC_noBLING.json -i /home/a1r/github/noaa-gfdl/catalogs/ESM4.5v01_om5b04_piC.json -o combinedcat.json
    """
    # inputfiles:  sequence of json catalog specification paths (click passes
    #              a tuple because the option is declared with multiple=True).
    # output_path: path of the combined json catalog specification to write.
    # Exits with a non-zero status (message on stderr) when fewer than two
    # inputs are given, a csv is missing, or the schemas differ in anything
    # other than "catalog_file".
    json_paths = [pathlib.Path(p) for p in inputfiles or ()]
    if not json_paths:
        sys.exit("cannot parse inputfiles")
    if len(json_paths) < 2:
        sys.exit("cannot parse inputfiles2")

    combined_json = output_path
    combined_csv = pathlib.Path(combined_json).with_suffix('.csv')

    # Deduce each csv from its json path and make sure it is really there
    # before doing any work.
    cat_csvs = [p.with_suffix('.csv') for p in json_paths]
    for csv_path in cat_csvs:
        print(csv_path)
    missing = [os.fspath(c) for c in cat_csvs if not c.is_file()]
    if missing:
        sys.exit("cannot find catalog csv file(s): " + ", ".join(missing))

    # Load every catalog specification; the first one is the reference schema
    # and also the template for the combined specification.
    specs = []
    for json_path in json_paths:
        with open(json_path) as spec_file:
            specs.append(json.load(spec_file))
    catspec = specs[0]

    ##### Check that every schema matches the reference one
    for other_spec in specs[1:]:
        differ = diff(catspec, other_spec)
        if not differ:
            # Identical specifications are trivially combinable.
            continue
        print("INFO: Schema differs")
        print(differ)
        only_catalog_file = isinstance(differ, dict) and list(differ) == ["catalog_file"]
        if not only_catalog_file:
            sys.exit("Schema likely varies significantly, cannot combine")
        print("We can combine since the catalog_file is the only difference")

    #### The headers are the same: append the data frames and write the combined csv
    df_concat = pd.concat([pd.read_csv(f) for f in cat_csvs], ignore_index=True)
    df_concat.to_csv(combined_csv, index=False)

    # Write out the combined catalog specification, pointing at the combined csv
    catspec['catalog_file'] = os.fspath(combined_csv)
    with open(combined_json, 'w') as outfile:
        outfile.write(json.dumps(catspec, indent=4))

    # Print pointers
    print("Combined catalog specification- ", combined_json)
    print("Combined csv/catalog- ", combined_csv)
|
||
def combine_cats_cli(**kwargs):
    """Forward all keyword arguments to the ``combine_cats`` command and return its result."""
    outcome = combine_cats(**kwargs)
    return outcome
|
||
# Run the command line interface only when executed as a script.
if "__main__" == __name__:
    combine_cats_cli()