Edit .git/config
to set up the upstream
remote:
[remote "upstream"]
url = <URL of GitHub repository or Azure mirror>
fetch = +refs/heads/*:refs/remotes/upstream/*
fetch = +refs/pull/*/head:refs/remotes/upstream/pr/*
Initialize LFS hooks and fetch references from upstream:
$ git lfs install
$ git fetch upstream
This notebook uses the bokeh
plotting backend by default. Install it alongside your tardis
environment by doing:
$ conda activate tardis
$ conda install bokeh -c conda-forge --no-update-deps
tardis-refdata/notebooks
directory.
The ReferenceComparer object loads two versions of the reference data. Pass at least one Git label (e.g. hash, tag, branch name). If either label is set to None,
the current data in the directory is used for that reference. For example:
comparer = ReferenceComparer(ref1_hash=None, ref2_hash='upstream/master')
compares unit_test_data.h5
from your local repository against the HEAD
of the upstream
remote.
Please set the labels you want to compare now:
# Default Git labels for the two references; None means "use the file currently in the directory".
REF1_HASH_DEFAULT = None
REF2_HASH_DEFAULT = 'upstream/master'
This feature is especially useful for CI pipelines: the ref1_hash
and ref2_hash
parameters can be passed as environment variables before running the notebook, overriding the defaults defined in the above cell.
export REF2_HASH='upstream/master'
If you want to switch to the matplotlib
backend, pass the mpl_backend=True
option to the compare_output_nu
and compare_spectrum
functions.
When you are done, call the .teardown()
method to delete temporary files.
import os
import shutil
import tempfile
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.models.tools import HoverTool
from bokeh.io import output_notebook
output_notebook()
Make sure you are in the root of tardis-refdata
after running the following cell.
cd ..
/home/vsts/work/1/s/tardis-refdata
def highlight_missing(val):
    """Return the CSS background for a boolean flag cell.

    Green is returned when ``val == True``; every other value (``False``,
    ``None``, ...) is rendered red.
    """
    present_css = 'background-color: #BCF5A9'
    missing_css = 'background-color: #F5A9A9'
    # Equality (not identity) is deliberate so that NumPy booleans compare as expected.
    return present_css if val == True else missing_css  # noqa: E712
def highlight_relative_difference(val):
    """Return the CSS background colour for a relative-difference cell.

    Colour scale:

    * ``None``, NaN or ``val <= 1e-2``  -> green
    * ``1e-2 < val <= 1e-1``            -> yellow
    * ``1e-1 < val <= 1``               -> orange
    * ``val > 1``                       -> red

    Parameters
    ----------
    val : float or None
        Relative difference; ``None`` means no difference was recorded.

    Returns
    -------
    str
        A ``background-color: ...`` CSS declaration.
    """
    green = 'background-color: #BCF5A9'
    if val is None:
        return green
    # Test the largest threshold first: in ascending order the first branch
    # (> 1e-2) would swallow every larger value and the rest would be dead code.
    if val > 1:
        return 'background-color: #F5A9A9'
    if val > 1e-1:
        return 'background-color: #F5D0A9'
    if val > 1e-2:
        return 'background-color: #F2F5A9'
    # Also reached for NaN, for which every comparison above is False.
    return green
class ReferenceComparer(object):
    """Stage two versions of a reference HDF5 file and compare their contents.

    Each reference is either checked out from Git (when a label such as a
    hash, tag or branch name is given) or copied from the current directory
    (when the label is ``None``). Both copies are placed in a temporary
    directory as ``ref1_<compare_path>`` and ``ref2_<compare_path>``; their
    full paths are exposed as ``ref1_fname`` and ``ref2_fname``.

    Parameters
    ----------
    ref1_hash, ref2_hash : str or None
        Git labels of the two references. At least one must be given.
    compare_path : str
        File to compare, relative to the current working directory.

    Raises
    ------
    ValueError
        If both ``ref1_hash`` and ``ref2_hash`` are ``None``.
    """

    def __init__(self, ref1_hash=None, ref2_hash=None, compare_path='unit_test_data.h5'):
        # Raise instead of assert: asserts are stripped under ``python -O``.
        if ref1_hash is None and ref2_hash is None:
            raise ValueError("At least one of ref1_hash and ref2_hash must not be None")
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = compare_path
        self.tmp_dir = None
        self.setup()

    def setup(self):
        """Create the temporary directory and stage both references in it.

        Sets ``tmp_dir``, ``ref1_fname`` and ``ref2_fname``. If staging fails,
        the temporary directory is removed again before the error propagates.
        """
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))
        try:
            for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash], start=1):
                prefix = 'ref{0}_'.format(ref_id)
                target = os.path.join(self.tmp_dir, prefix + self.compare_path)
                if ref_hash is not None:
                    self._copy_data_from_hash(ref_hash, prefix)
                else:
                    # Synchronous copy: the file must exist by the time setup()
                    # returns (an un-waited ``cp`` subprocess gives no such guarantee).
                    shutil.copy(self.compare_path, target)
                setattr(self, 'ref{0}_fname'.format(ref_id), target)
        except Exception:
            # When called from __init__ the caller never receives the instance,
            # so nobody else could clean the directory up.
            self.teardown()
            raise

    def teardown(self):
        """Delete the temporary directory; calling it more than once is harmless."""
        if self.tmp_dir is not None:
            shutil.rmtree(self.tmp_dir)
            self.tmp_dir = None

    def _copy_data_from_hash(self, ref_hash, prefix):
        """Check out ``compare_path`` at ``ref_hash`` into the temporary directory.

        The checked-out file is renamed to ``prefix + compare_path``.

        Raises
        ------
        subprocess.CalledProcessError
            If ``git checkout`` exits with a non-zero status (for example when
            the label is unknown or the file does not exist at that revision).
        """
        # NOTE(review): ``git checkout <ref> -- <path>`` also updates the index of
        # the repository this is run from -- confirm that this is acceptable.
        git_cmd = ['git',
                   '--work-tree={0}'.format(self.tmp_dir),
                   'checkout', ref_hash, '--', self.compare_path]
        # Fail here with git's own exit status rather than later with a
        # confusing "No such file" error from shutil.move.
        subprocess.check_call(git_cmd)
        shutil.move(os.path.join(self.tmp_dir, self.compare_path),
                    os.path.join(self.tmp_dir, prefix + self.compare_path))

    def generate_test_table(self):
        """Build a table of the HDF keys found in either reference.

        Returns
        -------
        pandas.DataFrame
            Indexed by HDF key, with boolean columns ``exists_1`` and
            ``exists_2`` telling whether the key is present in each reference.
        """
        # Context managers guarantee the stores are closed even if keys() fails.
        with pd.HDFStore(self.ref1_fname, mode='r') as store1:
            rd1_keys = store1.keys()
        with pd.HDFStore(self.ref2_fname, mode='r') as store2:
            rd2_keys = store2.keys()
        rd1_df = pd.DataFrame(index=rd1_keys, columns=['exists'])
        rd2_df = pd.DataFrame(index=rd2_keys, columns=['exists'])
        rd1_df['exists'] = True
        rd2_df['exists'] = True
        joined_df = rd1_df.join(rd2_df, how='outer', lsuffix='_1', rsuffix='_2')
        # Keys missing on one side come out of the outer join as NaN.
        joined_df = joined_df.fillna(False)
        return joined_df

    @staticmethod
    def _reduce(diff, how):
        """Collapse a Series or DataFrame of differences to one scalar via ``how``."""
        value = getattr(diff, how)(skipna=True)
        if isinstance(value, pd.Series):
            # A DataFrame reduces to one value per column first; reduce once more.
            value = getattr(value, how)()
        return value

    def compare_refdata(self, test_table):
        """Compare every entry that exists in both references.

        Adds the columns ``match``, ``abs_diff_mean``, ``abs_diff_max``,
        ``rel_diff_mean`` and ``rel_diff_max`` to ``test_table`` (modified in
        place). Rows whose key is missing from either reference keep ``None``
        in all of them; the difference columns are only filled for mismatches.

        Parameters
        ----------
        test_table : pandas.DataFrame
            Table as returned by ``generate_test_table``.

        Returns
        -------
        pandas.DataFrame
            The same ``test_table`` object.

        Raises
        ------
        ValueError
            If an entry of the first reference is neither a Series nor a DataFrame.
        """
        for column in ('match', 'abs_diff_mean', 'abs_diff_max',
                       'rel_diff_mean', 'rel_diff_max'):
            test_table[column] = None
        for row_id, row in test_table.iterrows():
            if not row[['exists_1', 'exists_2']].all():
                continue
            ref1_df = pd.read_hdf(self.ref1_fname, row_id)
            ref2_df = pd.read_hdf(self.ref2_fname, row_id)
            if isinstance(ref1_df, pd.Series):
                assert_equal = pd.testing.assert_series_equal
            elif isinstance(ref1_df, pd.DataFrame):
                assert_equal = pd.testing.assert_frame_equal
            else:
                raise ValueError('Needs to be a Series or DataFrame but is ' + str(type(ref1_df)))
            try:
                assert_equal(ref1_df, ref2_df)
            except AssertionError:
                test_table.loc[row_id, 'match'] = False
                abs_diff = np.fabs(ref1_df - ref2_df)
                # Mask entries where the reference value is zero to avoid inf/NaN ratios.
                rel_diff = (abs_diff / np.fabs(ref1_df))[ref1_df != 0]
                test_table.loc[row_id, 'abs_diff_mean'] = self._reduce(abs_diff, 'mean')
                test_table.loc[row_id, 'abs_diff_max'] = self._reduce(abs_diff, 'max')
                test_table.loc[row_id, 'rel_diff_mean'] = self._reduce(rel_diff, 'mean')
                test_table.loc[row_id, 'rel_diff_max'] = self._reduce(rel_diff, 'max')
            else:
                test_table.loc[row_id, 'match'] = True
        return test_table
Check if REF1_HASH
and REF2_HASH
are environment variables. If not defined (or empty), use the defaults defined above.
# An environment variable wins when it is set and non-empty; otherwise
# (unset or empty string) the notebook default is used.
REF1_HASH = os.environ.get('REF1_HASH') or REF1_HASH_DEFAULT
REF2_HASH = os.environ.get('REF2_HASH') or REF2_HASH_DEFAULT
REF1_HASH, REF2_HASH
('1b73a6bb74d56636101e62257774c43009fabc88', '34a759c65050784e7dd799d1358a464abc04653c')
# Instantiating runs setup(): both references are staged in a fresh temporary directory.
comparer = ReferenceComparer(ref1_hash=REF1_HASH, ref2_hash=REF2_HASH)
Created temporary directory at /tmp/tmp6l5fj_tn. Delete after use with .teardown
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 787 try: --> 788 os.rename(src, real_dst) 789 except OSError: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp6l5fj_tn/unit_test_data.h5' -> '/tmp/tmp6l5fj_tn/ref2_unit_test_data.h5' During handling of the above exception, another exception occurred: FileNotFoundError Traceback (most recent call last) <ipython-input-1-7430fed7be7e> in <module> ----> 1 comparer = ReferenceComparer(ref1_hash=REF1_HASH, ref2_hash=REF2_HASH) <ipython-input-1-aa05e4877f43> in __init__(self, ref1_hash, ref2_hash, compare_path) 7 self.compare_path = compare_path 8 self.tmp_dir = None ----> 9 self.setup() 10 11 def setup(self): <ipython-input-1-aa05e4877f43> in setup(self) 15 ref_id += 1 16 if ref_hash is not None: ---> 17 self._copy_data_from_hash(ref_hash, 'ref{0}_'.format(ref_id)) 18 else: 19 subprocess.Popen('cp {0} {1}'.format(self.compare_path, <ipython-input-1-aa05e4877f43> in _copy_data_from_hash(self, ref_hash, prefix) 33 p = subprocess.Popen(git_cmd) 34 p.wait() ---> 35 shutil.move(os.path.join(self.tmp_dir, self.compare_path), 36 os.path.join(self.tmp_dir, prefix + self.compare_path)) 37 /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 800 rmtree(src) 801 else: --> 802 copy_function(src, real_dst) 803 os.unlink(src) 804 return real_dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copy2(src, dst, follow_symlinks) 430 if os.path.isdir(dst): 431 dst = os.path.join(dst, os.path.basename(src)) --> 432 copyfile(src, dst, follow_symlinks=follow_symlinks) 433 copystat(src, dst, follow_symlinks=follow_symlinks) 434 return dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copyfile(src, dst, follow_symlinks) 259 os.symlink(os.readlink(src), dst) 260 else: --> 261 with open(src, 'rb') as 
fsrc, open(dst, 'wb') as fdst: 262 # macOS 263 if _HAS_FCOPYFILE: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp6l5fj_tn/unit_test_data.h5'
# One row per HDF key found in either reference; exists_1 / exists_2 flag where it is present.
tt = comparer.generate_test_table()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-8117ee489402> in <module> ----> 1 tt = comparer.generate_test_table() NameError: name 'comparer' is not defined
# Adds the match flag and absolute/relative difference columns for keys present in both references.
tt = comparer.compare_refdata(tt)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-197587a6f814> in <module> ----> 1 tt = comparer.compare_refdata(tt) NameError: name 'comparer' is not defined
# Render the table with colour coding: presence/match flags via highlight_missing,
# relative differences via highlight_relative_difference.
tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap(
highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap(
highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-49edc6aa32a1> in <module> ----> 1 tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap( 2 highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap( 3 highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max']) NameError: name 'tt' is not defined
If parts of the reference data show differences between revisions, you should invest some time examining these differences in detail. Often, visualizing the relevant data blocks already helps.
You can use the following plotting routines as a blueprint and adjust and extend them to your needs.
def compare_output_nu(df1, df2, mpl_backend=False):
    """Plot the ``output_nu`` values of two references against each other.

    The left panel is a scatter plot of ``df1`` versus ``df2``; the right
    panel shows the histogram of each over their common value range.

    Parameters
    ----------
    df1, df2 : pandas objects
        Values of reference 1 and reference 2. They must expose ``.min()``,
        ``.max()`` and ``.values``.
        NOTE(review): presumably equal-length Series -- confirm against the HDF data.
    mpl_backend : bool, optional
        Draw with matplotlib instead of Bokeh. Default is ``False``.

    Returns
    -------
    None
    """
    nu_min = np.min([df1.min(), df2.min()])
    nu_max = np.max([df1.max(), df2.max()])

    if mpl_backend:
        bins = np.linspace(nu_min, nu_max, 100)
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(df1, df2, ',')
        plt.xlabel("output_nu, ref 1")
        plt.ylabel("output_nu, ref 2")
        plt.subplot(122)
        plt.hist(df1, bins=bins, histtype="step", label="ref 1")
        plt.hist(df2, bins=bins, histtype="step", label="ref 2")
        plt.xlabel("output_nu")
        plt.legend(frameon=False)
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    output_nu = ColumnDataSource(pd.DataFrame.from_records({'x': df1.values,
                                                            'y': df2.values}))
    p.circle('x', 'y', size=1, source=output_nu)
    p.xaxis.axis_label = "output_nu, ref 1"
    p.yaxis.axis_label = "output_nu, ref 2"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.add_tools(hover)

    # Step lines are a hacky way to make histograms with Bokeh.
    hist_range = [nu_min, nu_max]
    arr_hist1, _ = np.histogram(df1.values, bins=100, range=hist_range)
    # The second histogram must be built from df2; binning df1 twice made the
    # "ref 2" curve an exact copy of "ref 1".
    arr_hist2, _ = np.histogram(df2.values, bins=100, range=hist_range)
    hist_x = np.linspace(nu_min, nu_max, 100)
    hist1 = ColumnDataSource(pd.DataFrame.from_records({'x': hist_x,
                                                        'y': arr_hist1}))
    hist2 = ColumnDataSource(pd.DataFrame.from_records({'x': hist_x,
                                                        'y': arr_hist2}))

    q = figure()
    q.step('x', 'y', source=hist1, legend_label='ref 1')
    q.step('x', 'y', source=hist2, legend_label='ref 2', color='#ff7f0e')
    q.xaxis.axis_label = "output_nu"
    q.xaxis.formatter.precision = 1
    q.legend.click_policy = "hide"
    # Currently HoverTool does not work for step line glyph.
    # See: https://github.com/bokeh/bokeh/issues/7419
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
def compare_spectrum(ref1_nu, ref1_L, ref2_nu, ref2_L, mpl_backend=False):
    """Plot two spectra on top of each other together with their ratio.

    The left panel shows ``L`` against ``nu`` for both references; the right
    panel shows ``ref1_L / ref2_L`` against ``ref1_nu``.

    Parameters
    ----------
    ref1_nu, ref1_L : pandas.Series
        Frequencies and luminosities of reference 1.
    ref2_nu, ref2_L : pandas.Series
        Frequencies and luminosities of reference 2.
        NOTE(review): all four are assumed to have the same length -- confirm.
    mpl_backend : bool, optional
        Draw with matplotlib instead of Bokeh. Default is ``False``.

    Returns
    -------
    None
    """
    if mpl_backend:
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(ref1_nu, ref1_L, label="ref 1")
        plt.plot(ref2_nu, ref2_L, label="ref 2")
        plt.xlabel("nu")
        plt.ylabel("L")
        plt.legend(frameon=False)
        plt.subplot(122)
        plt.plot(ref1_nu, ref1_L / ref2_L)
        plt.xlabel("nu")
        plt.ylabel("L ref 1 / L ref 2")
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    # Plain arrays are used throughout so that every data source is built the same way.
    spectrum1 = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L.values}))
    spectrum2 = ColumnDataSource(pd.DataFrame.from_records({'x': ref2_nu.values,
                                                            'y': ref2_L.values}))
    p.line('x', 'y', source=spectrum1, legend_label='ref 1')
    p.line('x', 'y', source=spectrum2, legend_label='ref 2', color='#ff7f0e')
    # x carries the frequency and y the luminosity, matching the matplotlib branch.
    p.xaxis.axis_label = "nu"
    p.yaxis.axis_label = "L"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.legend.click_policy = "hide"
    p.add_tools(hover)

    q = figure()
    lum_ratio = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L.values / ref2_L.values}))
    q.circle('x', 'y', size=1, source=lum_ratio)
    q.xaxis.axis_label = "nu"
    q.yaxis.axis_label = "L ref 1 / L ref 2"
    q.xaxis.formatter.precision = 1
    q.yaxis.formatter.precision = 1
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
Get the data and find all the entries for which differences exist:
# Open both staged reference files read-only; they stay open for the plotting cells below.
tmp1 = pd.HDFStore(comparer.ref1_fname, "r")
tmp2 = pd.HDFStore(comparer.ref2_fname, "r")
# Keys that are present in both references but whose contents do not match.
diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2b40df7d3038> in <module> ----> 1 tmp1 = pd.HDFStore(comparer.ref1_fname, "r") 2 tmp2 = pd.HDFStore(comparer.ref2_fname, "r") 3 4 diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index NameError: name 'comparer' is not defined
# Scatter plot and histograms of the '/test_simulation/output_nu' entry of both references.
compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-54d5fb0ce665> in <module> ----> 1 compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu']) NameError: name 'tmp1' is not defined
# NOTE(review): [:-1] drops the last frequency entry -- presumably `_frequency` holds
# bin edges, one more than the luminosity values; confirm against the HDF data.
compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1],
tmp1['/test_runner_simple/spectrum/luminosity'],
tmp2['/test_runner_simple/spectrum/_frequency'][:-1],
tmp2['/test_runner_simple/spectrum/luminosity'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-0257c581a562> in <module> ----> 1 compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1], 2 tmp1['/test_runner_simple/spectrum/luminosity'], 3 tmp2['/test_runner_simple/spectrum/_frequency'][:-1], 4 tmp2['/test_runner_simple/spectrum/luminosity']) NameError: name 'tmp1' is not defined
# Close the HDF stores opened for plotting before deleting the files underneath them.
tmp1.close()
tmp2.close()
comparer.teardown()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2244ed23302f> in <module> ----> 1 comparer.teardown() NameError: name 'comparer' is not defined