.git/configure
to set up upstream
remote:
[remote "upstream"]
url = <URL of GitHub repository or Azure mirror>
fetch = +refs/heads/*:refs/remotes/upstream/*
fetch = +refs/pull/*/head:refs/remotes/upstream/pr/*
Initialize LFS hooks and fetch references from upstream:
$ git lfs install
$ git fetch upstream
This notebook uses the bokeh
plotting backend by default. Install it alongside your tardis
environment by doing:
$ conda activate tardis
$ conda install bokeh -c conda-forge --no-update-deps
tardis-refdata/notebooks
directory.
The ReferenceComparer object loads two versions of the reference data; at least one of them must be identified by a Git label (e.g. hash, tag, branch name). If either is set to None
it will just use the current data in the directory. For example:
comparer = ReferenceComparer(ref1_hash=None, ref2_hash='upstream/master')
compares unit_test_data.h5
from your local repository against the HEAD
of the upstream
remote.
Please set the labels you want to compare now:
# Default Git labels for the two data sets to compare. None means "use the
# file currently in the working directory" instead of a Git revision.
REF1_HASH_DEFAULT = None
REF2_HASH_DEFAULT = 'upstream/master'
This feature is especially useful for CI pipelines: ref1_hash
and ref2_hash
parameters can be passed as environment variables before running the notebook, overwriting the defaults defined in the above cell.
export REF2_HASH='upstream/master'
If you want to switch to the matplotlib
backend pass the mpl_backend=True
option to the compare_output_nu
and compare_spectrum
functions.
.teardown()
method to delete temporary files.
import os
import shutil
import tempfile
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.models.tools import HoverTool
from bokeh.io import output_notebook
output_notebook()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) <ipython-input-1-9f363699fb7c> in <module> 6 import pandas as pd 7 import matplotlib.pyplot as plt ----> 8 from bokeh.plotting import figure, show, ColumnDataSource 9 from bokeh.layouts import gridplot 10 from bokeh.models.tools import HoverTool ModuleNotFoundError: No module named 'bokeh'
Make sure you are in the root of tardis-refdata
after running the following cell.
cd ..
/home/vsts/work/1/s/tardis-refdata
def highlight_missing(val):
    """Return the CSS background rule for a boolean-like table cell.

    Cells that compare equal to ``True`` get the green shade; every other
    value (``False``, ``None``, ...) gets the red shade.
    """
    # Equality (not truthiness) is intentional: only values equal to True
    # count as "present"/"matching".
    is_true = (val == True)  # noqa: E712
    return 'background-color: #BCF5A9' if is_true else 'background-color: #F5A9A9'
def highlight_relative_difference(val):
    """Return the CSS background rule for a relative-difference cell.

    Parameters
    ----------
    val : float or None
        Relative difference; ``None`` means no difference was recorded.

    Returns
    -------
    str
        Green for ``None`` or values up to 1e-2, yellow above 1e-2,
        orange above 1e-1 and red above 1.
    """
    ret = 'background-color: #BCF5A9'
    if val is None:
        return ret
    # Test the largest threshold first; checking `> 1e-2` first would shadow
    # the stricter thresholds and leave them unreachable.
    if val > 1:
        ret = 'background-color: #F5A9A9'
    elif val > 1e-1:
        ret = 'background-color: #F5D0A9'
    elif val > 1e-2:
        ret = 'background-color: #F2F5A9'
    return ret
class ReferenceComparer(object):
    """Compare two versions of a reference-data HDF5 file.

    Each version is either checked out from Git (when a label such as a
    hash, tag or branch name is given) or copied from the current working
    directory (when the label is ``None``). Both copies are placed in a
    temporary directory that must be removed with :meth:`teardown`.

    Parameters
    ----------
    ref1_hash, ref2_hash : str or None
        Git labels of the two versions. At least one must not be ``None``.
    compare_path : str
        Path of the HDF5 file, relative to the current working directory.
    """

    def __init__(self, ref1_hash=None, ref2_hash=None, compare_path='unit_test_data.h5'):
        assert not ((ref1_hash is None) and (ref2_hash is None)), "One hash can not be None"
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = compare_path
        self.tmp_dir = None
        self.setup()

    def setup(self):
        """Create the temporary directory and place both file versions in it.

        Sets ``self.ref1_fname`` and ``self.ref2_fname`` to the copies. If
        anything fails, the temporary directory is removed again before the
        exception propagates, so a failed constructor does not leak it.
        """
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))
        try:
            for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash], start=1):
                prefix = 'ref{0}_'.format(ref_id)
                target = os.path.join(self.tmp_dir, prefix + self.compare_path)
                if ref_hash is not None:
                    self._copy_data_from_hash(ref_hash, prefix)
                else:
                    # Synchronous copy: the file must exist before it is read,
                    # and a missing source should raise immediately.
                    shutil.copy(self.compare_path, target)
                setattr(self, 'ref{0}_fname'.format(ref_id), target)
        except Exception:
            self.teardown()
            raise

    def teardown(self):
        """Delete the temporary directory; calling it again is a no-op."""
        if self.tmp_dir is None:
            return
        shutil.rmtree(self.tmp_dir)
        self.tmp_dir = None

    def _copy_data_from_hash(self, ref_hash, prefix):
        """Check out ``compare_path`` at ``ref_hash`` into the temp directory.

        The file is renamed to ``prefix + compare_path``.

        Raises
        ------
        subprocess.CalledProcessError
            If ``git checkout`` exits with a non-zero status.
        """
        git_cmd = [
            'git',
            '--work-tree={0}'.format(self.tmp_dir),
            'checkout', ref_hash,
            # '--' keeps the file name from being parsed as a revision/option.
            '--', self.compare_path,
        ]
        # Fail here with git's own exit status instead of a confusing
        # FileNotFoundError from the move below.
        subprocess.check_call(git_cmd)
        shutil.move(os.path.join(self.tmp_dir, self.compare_path),
                    os.path.join(self.tmp_dir, prefix + self.compare_path))

    def generate_test_table(self):
        """Return a table of which HDF keys exist in each of the two files.

        Returns
        -------
        pandas.DataFrame
            Indexed by HDF key, with boolean columns ``exists_1`` and
            ``exists_2``.
        """
        # Context managers close the stores even if reading the keys fails.
        with pd.HDFStore(self.ref1_fname, mode='r') as rd1_hdfs:
            rd1_keys = rd1_hdfs.keys()
        with pd.HDFStore(self.ref2_fname, mode='r') as rd2_hdfs:
            rd2_keys = rd2_hdfs.keys()
        rd1_df = pd.DataFrame(index=rd1_keys, columns=['exists'])
        rd2_df = pd.DataFrame(index=rd2_keys, columns=['exists'])
        rd1_df['exists'] = True
        rd2_df['exists'] = True
        joined_df = rd1_df.join(rd2_df, how='outer', lsuffix='_1', rsuffix='_2')
        joined_df = joined_df.fillna(False)
        return joined_df

    @staticmethod
    def _diff_statistics(ref1, ref2):
        """Return mean/max absolute and relative differences as a dict."""
        abs_diff = np.fabs(ref1 - ref2)
        # Entries where the reference is zero are masked out of the
        # relative difference.
        rel_diff = (abs_diff / np.fabs(ref1))[ref1 != 0]

        def reduce(values, how):
            result = getattr(values, how)(skipna=True)
            if isinstance(result, pd.Series):
                # DataFrame input: collapse the per-column results as well.
                result = getattr(result, how)(skipna=True)
            return result

        return {
            'abs_diff_mean': reduce(abs_diff, 'mean'),
            'abs_diff_max': reduce(abs_diff, 'max'),
            'rel_diff_mean': reduce(rel_diff, 'mean'),
            'rel_diff_max': reduce(rel_diff, 'max'),
        }

    def compare_refdata(self, test_table):
        """Compare every key present in both files and record the result.

        Adds the columns ``match``, ``abs_diff_mean``, ``abs_diff_max``,
        ``rel_diff_mean`` and ``rel_diff_max`` to ``test_table`` (in place)
        and returns it. Rows whose key is missing from either file keep
        ``None`` in all of these columns.

        Raises
        ------
        ValueError
            If a stored object is neither a Series nor a DataFrame.
        """
        for column in ('match', 'abs_diff_mean', 'abs_diff_max',
                       'rel_diff_mean', 'rel_diff_max'):
            test_table[column] = None
        for row_id, row in test_table.iterrows():
            if not row[['exists_1', 'exists_2']].all():
                continue
            ref1_df = pd.read_hdf(self.ref1_fname, row_id)
            ref2_df = pd.read_hdf(self.ref2_fname, row_id)
            if isinstance(ref1_df, pd.Series):
                assert_equal = pd.testing.assert_series_equal
            elif isinstance(ref1_df, pd.DataFrame):
                assert_equal = pd.testing.assert_frame_equal
            else:
                raise ValueError('Needs to be a Series or DataFrame but is ' + str(type(ref1_df)))
            try:
                assert_equal(ref1_df, ref2_df)
            except AssertionError:
                test_table.loc[row_id, 'match'] = False
                for column, value in self._diff_statistics(ref1_df, ref2_df).items():
                    test_table.loc[row_id, column] = value
            else:
                test_table.loc[row_id, 'match'] = True
        return test_table
Check if REF1_HASH
and REF2_HASH
are environment variables. If not defined (or empty), use the defaults defined above.
# An environment variable overrides the matching default; an unset or empty
# variable falls back to REF1_HASH_DEFAULT / REF2_HASH_DEFAULT.
REF1_HASH = os.environ.get('REF1_HASH') or REF1_HASH_DEFAULT
REF2_HASH = os.environ.get('REF2_HASH') or REF2_HASH_DEFAULT
REF1_HASH, REF2_HASH
('b6e5238', 'ae743ac')
# Build the comparer; the constructor calls setup(), which creates the
# temporary directory and fetches both versions of the data file.
comparer = ReferenceComparer(ref1_hash=REF1_HASH, ref2_hash=REF2_HASH)
Created temporary directory at /tmp/tmp27baxs0n. Delete after use with .teardown
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 790 try: --> 791 os.rename(src, real_dst) 792 except OSError: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp27baxs0n/unit_test_data.h5' -> '/tmp/tmp27baxs0n/ref1_unit_test_data.h5' During handling of the above exception, another exception occurred: FileNotFoundError Traceback (most recent call last) <ipython-input-1-7430fed7be7e> in <module> ----> 1 comparer = ReferenceComparer(ref1_hash=REF1_HASH, ref2_hash=REF2_HASH) <ipython-input-1-aa05e4877f43> in __init__(self, ref1_hash, ref2_hash, compare_path) 7 self.compare_path = compare_path 8 self.tmp_dir = None ----> 9 self.setup() 10 11 def setup(self): <ipython-input-1-aa05e4877f43> in setup(self) 15 ref_id += 1 16 if ref_hash is not None: ---> 17 self._copy_data_from_hash(ref_hash, 'ref{0}_'.format(ref_id)) 18 else: 19 subprocess.Popen('cp {0} {1}'.format(self.compare_path, <ipython-input-1-aa05e4877f43> in _copy_data_from_hash(self, ref_hash, prefix) 33 p = subprocess.Popen(git_cmd) 34 p.wait() ---> 35 shutil.move(os.path.join(self.tmp_dir, self.compare_path), 36 os.path.join(self.tmp_dir, prefix + self.compare_path)) 37 /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 803 rmtree(src) 804 else: --> 805 copy_function(src, real_dst) 806 os.unlink(src) 807 return real_dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copy2(src, dst, follow_symlinks) 433 if os.path.isdir(dst): 434 dst = os.path.join(dst, os.path.basename(src)) --> 435 copyfile(src, dst, follow_symlinks=follow_symlinks) 436 copystat(src, dst, follow_symlinks=follow_symlinks) 437 return dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copyfile(src, dst, follow_symlinks) 262 os.symlink(os.readlink(src), dst) 263 else: --> 264 with open(src, 'rb') as 
fsrc, open(dst, 'wb') as fdst: 265 # macOS 266 if _HAS_FCOPYFILE: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp27baxs0n/unit_test_data.h5'
# Table of HDF keys with exists_1 / exists_2 flags for the two files.
tt = comparer.generate_test_table()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-8117ee489402> in <module> ----> 1 tt = comparer.generate_test_table() NameError: name 'comparer' is not defined
# Add the match flag and absolute/relative difference columns to the table.
tt = comparer.compare_refdata(tt)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-197587a6f814> in <module> ----> 1 tt = comparer.compare_refdata(tt) NameError: name 'comparer' is not defined
# Render the summary table: boolean columns are coloured by
# highlight_missing, relative-difference columns by
# highlight_relative_difference.
tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap(
highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap(
highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-49edc6aa32a1> in <module> ----> 1 tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap( 2 highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap( 3 highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max']) NameError: name 'tt' is not defined
If parts of the reference data show differences between revisions, you should invest some time examining these differences in detail. Often, visualizing the relevant data blocks already helps.
You can use the following plotting routines as a blueprint and adjust and extend them to your needs.
def compare_output_nu(df1, df2, mpl_backend=False):
    """Plot two ``output_nu`` data sets against each other.

    Draws a scatter plot of ``df1`` versus ``df2`` next to step histograms
    of both data sets over their common value range.

    Parameters
    ----------
    df1, df2 : pandas.Series
        Data from reference 1 and reference 2 (accessed through ``.values``,
        ``.min()`` and ``.max()``).
    mpl_backend : bool, optional
        Use matplotlib instead of Bokeh when ``True``.
    """
    nu_min = np.min([df1.min(), df2.min()])
    nu_max = np.max([df1.max(), df2.max()])

    if mpl_backend:
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(df1, df2, ',')
        plt.xlabel("output_nu, ref 1")
        plt.ylabel("output_nu, ref 2")
        plt.subplot(122)
        plt.hist(df1, bins=np.linspace(nu_min, nu_max, 100), histtype="step", label="ref 1")
        plt.hist(df2, bins=np.linspace(nu_min, nu_max, 100), histtype="step", label="ref 2")
        plt.xlabel("output_nu")
        plt.legend(frameon=False)
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    output_nu = ColumnDataSource(pd.DataFrame.from_records({'x': df1.values,
                                                            'y': df2.values}))
    p.circle('x', 'y', size=1, source=output_nu)
    p.xaxis.axis_label = "output_nu, ref 1"
    p.yaxis.axis_label = "output_nu, ref 2"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.add_tools(hover)

    # Step lines are hacky way to make histograms with Bokeh
    arr_hist1, _ = np.histogram(df1.values,
                                bins=100,
                                range=[nu_min, nu_max])
    # The second histogram must be built from df2; building it from df1
    # makes the two curves identical and hides any difference.
    arr_hist2, _ = np.histogram(df2.values,
                                bins=100,
                                range=[nu_min, nu_max])
    hist1 = ColumnDataSource(pd.DataFrame.from_records({'x': np.linspace(nu_min, nu_max, 100),
                                                        'y': arr_hist1}))
    hist2 = ColumnDataSource(pd.DataFrame.from_records({'x': np.linspace(nu_min, nu_max, 100),
                                                        'y': arr_hist2}))

    q = figure()
    q.step('x', 'y', source=hist1, legend_label='ref 1')
    q.step('x', 'y', source=hist2, legend_label='ref 2', color='#ff7f0e')
    q.xaxis.axis_label = "output_nu"
    q.xaxis.formatter.precision = 1
    q.legend.click_policy = "hide"
    # Currently HoverTool does not work for step line glyph. See: https://github.com/bokeh/bokeh/issues/7419
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
def compare_spectrum(ref1_nu, ref1_L, ref2_nu, ref2_L, mpl_backend=False):
    """Plot two spectra and the ratio of their luminosities.

    The left panel overlays ``L`` versus ``nu`` for both references; the
    right panel shows ``ref1_L / ref2_L`` against ``ref1_nu``.

    Parameters
    ----------
    ref1_nu, ref1_L : pandas.Series
        Frequency and luminosity of reference 1 (accessed through
        ``.values`` on the Bokeh path).
    ref2_nu, ref2_L : pandas.Series
        Frequency and luminosity of reference 2.
    mpl_backend : bool, optional
        Use matplotlib instead of Bokeh when ``True``.
    """
    if mpl_backend:
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(ref1_nu, ref1_L, label="ref 1")
        plt.plot(ref2_nu, ref2_L, label="ref 2")
        plt.xlabel("nu")
        plt.ylabel("L")
        plt.legend(frameon=False)
        plt.subplot(122)
        plt.plot(ref1_nu, ref1_L / ref2_L)
        plt.xlabel("nu")
        plt.ylabel("L ref 1 / L ref 2")
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    # Plain arrays for both columns, matching the ratio source below.
    spectrum1 = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L.values}))
    spectrum2 = ColumnDataSource(pd.DataFrame.from_records({'x': ref2_nu.values,
                                                            'y': ref2_L.values}))
    p.line('x', 'y', source=spectrum1, legend_label='ref 1')
    p.line('x', 'y', source=spectrum2, legend_label='ref 2', color='#ff7f0e')
    # x carries the frequency and y the luminosity, as in the matplotlib
    # branch above.
    p.xaxis.axis_label = "nu"
    p.yaxis.axis_label = "L"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.legend.click_policy = "hide"
    p.add_tools(hover)

    q = figure()
    lum_ratio = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L.values / ref2_L.values}))
    q.circle('x', 'y', size=1, source=lum_ratio)
    q.xaxis.axis_label = "nu"
    q.yaxis.axis_label = "L ref 1 / L ref 2"
    q.xaxis.formatter.precision = 1
    q.yaxis.formatter.precision = 1
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
Get the data and find all the entries for which differences exist:
# Open both copies read-only. The stores stay open because the following
# cells index into them.
tmp1 = pd.HDFStore(comparer.ref1_fname, "r")
tmp2 = pd.HDFStore(comparer.ref2_fname, "r")
# Keys that exist in both files but whose contents did not compare equal.
diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2b40df7d3038> in <module> ----> 1 tmp1 = pd.HDFStore(comparer.ref1_fname, "r") 2 tmp2 = pd.HDFStore(comparer.ref2_fname, "r") 3 4 diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index NameError: name 'comparer' is not defined
# Compare the '/test_simulation/output_nu' entry of both files.
compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-54d5fb0ce665> in <module> ----> 1 compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu']) NameError: name 'tmp1' is not defined
# Compare the '/test_runner_simple/spectrum' entries of both files; the last
# '_frequency' element is dropped before plotting.
# NOTE(review): presumably '_frequency' holds bin edges (one more than luminosity) - confirm.
compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1],
tmp1['/test_runner_simple/spectrum/luminosity'],
tmp2['/test_runner_simple/spectrum/_frequency'][:-1],
tmp2['/test_runner_simple/spectrum/luminosity'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-0257c581a562> in <module> ----> 1 compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1], 2 tmp1['/test_runner_simple/spectrum/luminosity'], 3 tmp2['/test_runner_simple/spectrum/_frequency'][:-1], 4 tmp2['/test_runner_simple/spectrum/luminosity']) NameError: name 'tmp1' is not defined
# Close the HDF stores before teardown() deletes the files that back them.
tmp1.close()
tmp2.close()
comparer.teardown()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2244ed23302f> in <module> ----> 1 comparer.teardown() NameError: name 'comparer' is not defined