This notebook uses the bokeh
plotting backend by default. Install it alongside your tardis
environment by doing:
conda activate tardis
conda install bokeh -c conda-forge --no-update-deps
tardis-refdata/notebooks
directory.None
it will just use the current data in the directory. Otherwise, you can use any git label (e.g. hash, tag, branch name). For example: comparer = ReferenceComparer(ref2_hash='upstream/master')
matplotlib
backend pass the mpl_backend=True
option to the compare_output_nu
and compare_spectrum
function.
Use the .teardown()
method to delete temporary files afterwards.
import os
import shutil
import tempfile
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.layouts import gridplot
from bokeh.models.tools import HoverTool
from bokeh.io import output_notebook
output_notebook()
Ensure that you are in the root of tardis-refdata.
cd ..
/home/vsts/work/1/s/tardis-refdata
def highlight_missing(val):
    """Return the CSS background style for a boolean-like table cell.

    Cells that compare equal to ``True`` are coloured green; every other
    value is coloured red.
    """
    # ``== True`` (rather than truthiness) is intentional: only values that
    # compare equal to True are treated as "present".
    colour = '#BCF5A9' if val == True else '#F5A9A9'  # noqa: E712
    return 'background-color: ' + colour
def highlight_relative_difference(val):
    """Return the CSS background style for a relative-difference cell.

    Parameters
    ----------
    val : float or None
        Relative difference; ``None`` means no difference was recorded.

    Returns
    -------
    str
        Green for ``None`` or values up to 1e-2, yellow above 1e-2,
        orange above 1e-1 and red above 1.
    """
    if val is None:
        return 'background-color: #BCF5A9'
    # Test the largest threshold first. Checking ``> 1e-2`` first would also
    # match every larger value and make the orange/red branches unreachable.
    if val > 1:
        return 'background-color: #F5A9A9'
    if val > 1e-1:
        return 'background-color: #F5D0A9'
    if val > 1e-2:
        return 'background-color: #F2F5A9'
    # Small differences (and NaN, for which all comparisons are False).
    return 'background-color: #BCF5A9'
class ReferenceComparer(object):
    """Compare two revisions of a reference-data HDF5 file.

    Each of the two references is either a git label (hash, tag, branch
    name) or ``None``, in which case the file currently found at
    ``compare_path`` is used. Both versions are copied into a temporary
    directory as ``ref1_<name>`` and ``ref2_<name>``; their full paths are
    stored in ``self.ref1_fname`` and ``self.ref2_fname``.

    Parameters
    ----------
    ref1_hash, ref2_hash : str or None
        Git labels of the revisions to compare. At least one must be given.
    compare_path : str
        Path of the HDF5 file to compare, relative to the repository root
        (git is invoked from the current working directory).
    """

    def __init__(self, ref1_hash=None, ref2_hash=None, compare_path='unit_test_data.h5'):
        assert not ((ref1_hash is None) and (ref2_hash is None)), "One hash can not be None"
        self.ref1_hash = ref1_hash
        self.ref2_hash = ref2_hash
        self.compare_path = compare_path
        self.tmp_dir = None
        self.setup()

    def _ref_fname(self, prefix):
        """Return the temporary-directory path of the copy named by *prefix*."""
        # Only the file name is used, so a ``compare_path`` containing
        # directories still maps to a file directly inside ``tmp_dir``.
        return os.path.join(self.tmp_dir,
                            prefix + os.path.basename(self.compare_path))

    def setup(self):
        """Create the temporary directory and populate it with both files.

        Raises
        ------
        subprocess.CalledProcessError
            If git cannot check out the file for one of the labels.
        OSError
            If the working-copy file cannot be copied.
        """
        self.tmp_dir = tempfile.mkdtemp()
        print('Created temporary directory at {0}. Delete after use with .teardown'.format(self.tmp_dir))
        try:
            for ref_id, ref_hash in enumerate([self.ref1_hash, self.ref2_hash], start=1):
                prefix = 'ref{0}_'.format(ref_id)
                if ref_hash is not None:
                    self._copy_data_from_hash(ref_hash, prefix)
                else:
                    # Copy synchronously; a non-awaited ``cp`` subprocess
                    # could still be running when the file is first read.
                    shutil.copy(self.compare_path, self._ref_fname(prefix))
                setattr(self, 'ref{0}_fname'.format(ref_id), self._ref_fname(prefix))
        except Exception:
            # When called from __init__ the caller never receives the
            # instance, so nobody else could remove the directory.
            self.teardown()
            raise

    def teardown(self):
        """Delete the temporary directory; safe to call more than once."""
        if self.tmp_dir is not None:
            shutil.rmtree(self.tmp_dir)
            self.tmp_dir = None

    def _copy_data_from_hash(self, ref_hash, prefix):
        """Check out ``compare_path`` at *ref_hash* into the temp directory.

        The file is extracted with ``git --work-tree=<tmp_dir> checkout`` and
        then renamed so that its name starts with *prefix*.

        Raises
        ------
        subprocess.CalledProcessError
            If the git command exits with a non-zero status (e.g. unknown
            label or path).
        """
        # NOTE(review): per the git documentation a path checkout also
        # updates the index of the repository it is run in -- confirm that
        # this side effect is acceptable.
        git_cmd = ['git',
                   '--work-tree={0}'.format(self.tmp_dir),
                   'checkout', ref_hash, '--', self.compare_path]
        # Fail here with git's own status instead of a confusing
        # FileNotFoundError from the move below.
        subprocess.check_call(git_cmd)
        shutil.move(os.path.join(self.tmp_dir, self.compare_path),
                    self._ref_fname(prefix))

    def generate_test_table(self):
        """Build a table of all HDF keys found in either reference file.

        Returns
        -------
        pandas.DataFrame
            Indexed by HDF key, with boolean-like columns ``exists_1`` and
            ``exists_2`` telling whether the key is present in each file.
        """
        # Context managers close the stores even if reading the keys fails.
        with pd.HDFStore(self.ref1_fname, mode='r') as rd1_hdfs:
            rd1_keys = rd1_hdfs.keys()
        with pd.HDFStore(self.ref2_fname, mode='r') as rd2_hdfs:
            rd2_keys = rd2_hdfs.keys()
        rd1_df = pd.DataFrame(index=rd1_keys, columns=['exists'])
        rd2_df = pd.DataFrame(index=rd2_keys, columns=['exists'])
        rd1_df['exists'] = True
        rd2_df['exists'] = True
        joined_df = rd1_df.join(rd2_df, how='outer', lsuffix='_1', rsuffix='_2')
        # Keys missing from one side come out of the outer join as NaN.
        joined_df = joined_df.fillna(False)
        return joined_df

    @staticmethod
    def _diff_statistics(ref1, ref2):
        """Return mean/max of the absolute and relative differences.

        For DataFrames the per-column statistics are reduced once more, so
        every returned value is a scalar.
        """
        abs_diff = np.fabs(ref1 - ref2)
        # Relative differences are only kept where the reference is non-zero.
        rel_diff = (abs_diff / np.fabs(ref1))[ref1 != 0]
        stats = {}
        for name, diff in (('abs_diff', abs_diff), ('rel_diff', rel_diff)):
            mean, maximum = diff.mean(), diff.max()
            if isinstance(diff, pd.DataFrame):
                mean, maximum = mean.mean(), maximum.max()
            stats[name + '_mean'] = mean
            stats[name + '_max'] = maximum
        return stats

    def compare_refdata(self, test_table):
        """Compare every entry that exists in both reference files.

        Parameters
        ----------
        test_table : pandas.DataFrame
            Table as returned by :meth:`generate_test_table`. It is modified
            in place.

        Returns
        -------
        pandas.DataFrame
            *test_table* with the added columns ``match``,
            ``abs_diff_mean``, ``abs_diff_max``, ``rel_diff_mean`` and
            ``rel_diff_max``. Entries missing from either file keep ``None``
            in all of them; the difference columns are only filled for
            entries that do not match.

        Raises
        ------
        ValueError
            If a stored object is neither a Series nor a DataFrame.
        """
        for column in ('match', 'abs_diff_mean', 'abs_diff_max',
                       'rel_diff_mean', 'rel_diff_max'):
            test_table[column] = None
        for row_id, row in test_table.iterrows():
            if not row[['exists_1', 'exists_2']].all():
                continue
            ref1_df = pd.read_hdf(self.ref1_fname, row_id)
            ref2_df = pd.read_hdf(self.ref2_fname, row_id)
            # ``pd.testing`` is the public home of these helpers;
            # ``pd.util.testing`` was deprecated and later removed.
            if isinstance(ref1_df, pd.Series):
                assert_equal = pd.testing.assert_series_equal
            elif isinstance(ref1_df, pd.DataFrame):
                assert_equal = pd.testing.assert_frame_equal
            else:
                raise ValueError('Needs to be a Series or DataFrame but is' + str(type(ref1_df)))
            try:
                assert_equal(ref1_df, ref2_df)
            except AssertionError:
                test_table.loc[row_id, 'match'] = False
                for column, value in self._diff_statistics(ref1_df, ref2_df).items():
                    test_table.loc[row_id, column] = value
            else:
                test_table.loc[row_id, 'match'] = True
        return test_table
# ref1_hash is left as None, so reference 1 is the file currently on disk;
# reference 2 is checked out from the 'upstream/master' git label.
comparer = ReferenceComparer(ref2_hash='upstream/master')
Created temporary directory at /tmp/tmp9mtdeep_. Delete after use with .teardown
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 787 try: --> 788 os.rename(src, real_dst) 789 except OSError: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp9mtdeep_/unit_test_data.h5' -> '/tmp/tmp9mtdeep_/ref2_unit_test_data.h5' During handling of the above exception, another exception occurred: FileNotFoundError Traceback (most recent call last) <ipython-input-1-a49c68e1e4b6> in <module> ----> 1 comparer = ReferenceComparer(ref2_hash='upstream/master') <ipython-input-1-3cdb99616e04> in __init__(self, ref1_hash, ref2_hash, compare_path) 7 self.compare_path = compare_path 8 self.tmp_dir = None ----> 9 self.setup() 10 11 def setup(self): <ipython-input-1-3cdb99616e04> in setup(self) 15 ref_id += 1 16 if ref_hash is not None: ---> 17 self._copy_data_from_hash(ref_hash, 'ref{0}_'.format(ref_id)) 18 else: 19 subprocess.Popen('cp {0} {1}'.format(self.compare_path, <ipython-input-1-3cdb99616e04> in _copy_data_from_hash(self, ref_hash, prefix) 33 p = subprocess.Popen(git_cmd) 34 p.wait() ---> 35 shutil.move(os.path.join(self.tmp_dir, self.compare_path), 36 os.path.join(self.tmp_dir, prefix + self.compare_path)) 37 /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in move(src, dst, copy_function) 800 rmtree(src) 801 else: --> 802 copy_function(src, real_dst) 803 os.unlink(src) 804 return real_dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copy2(src, dst, follow_symlinks) 430 if os.path.isdir(dst): 431 dst = os.path.join(dst, os.path.basename(src)) --> 432 copyfile(src, dst, follow_symlinks=follow_symlinks) 433 copystat(src, dst, follow_symlinks=follow_symlinks) 434 return dst /usr/share/miniconda/envs/tardis/lib/python3.8/shutil.py in copyfile(src, dst, follow_symlinks) 259 os.symlink(os.readlink(src), dst) 260 else: --> 261 with open(src, 'rb') as fsrc, open(dst, 
'wb') as fdst: 262 # macOS 263 if _HAS_FCOPYFILE: FileNotFoundError: [Errno 2] No such file or directory: '/tmp/tmp9mtdeep_/unit_test_data.h5'
# List every HDF key together with its presence in each reference file.
tt = comparer.generate_test_table()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-8117ee489402> in <module> ----> 1 tt = comparer.generate_test_table() NameError: name 'comparer' is not defined
# Add the match flag and the absolute/relative difference columns.
tt = comparer.compare_refdata(tt)
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-197587a6f814> in <module> ----> 1 tt = comparer.compare_refdata(tt) NameError: name 'comparer' is not defined
# Render the table with colour-coded cells: the boolean columns go through
# highlight_missing, the relative differences through
# highlight_relative_difference.
tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap(
highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap(
highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-49edc6aa32a1> in <module> ----> 1 tt[["exists_1", "exists_2", 'rel_diff_mean', 'rel_diff_max', 'match']].style.applymap( 2 highlight_missing, subset=['exists_1', 'exists_2', 'match']).applymap( 3 highlight_relative_difference, subset=['rel_diff_mean', 'rel_diff_max']) NameError: name 'tt' is not defined
If parts of the reference data show differences between revisions, you should invest some time examining these differences in detail. Often, visualizing the relevant data blocks already helps.
You can use the following plotting routines as a blueprint and adjust and extend them to your needs.
def compare_output_nu(df1, df2, mpl_backend=False):
    """Plot two ``output_nu`` data sets against each other.

    The left panel is a scatter plot of reference 1 versus reference 2, the
    right panel shows a histogram of each data set over their common range.

    Parameters
    ----------
    df1, df2 : pandas.Series
        Data of reference 1 and reference 2. The code relies on ``.min()``,
        ``.max()`` and ``.values``.
    mpl_backend : bool, optional
        If True, draw with matplotlib instead of bokeh.
    """
    nu_min = np.min([df1.min(), df2.min()])
    nu_max = np.max([df1.max(), df2.max()])

    if mpl_backend:
        bins = np.linspace(nu_min, nu_max, 100)
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(df1, df2, ',')
        plt.xlabel("output_nu, ref 1")
        plt.ylabel("output_nu, ref 2")
        plt.subplot(122)
        plt.hist(df1, bins=bins, histtype="step", label="ref 1")
        plt.hist(df2, bins=bins, histtype="step", label="ref 2")
        plt.xlabel("output_nu")
        plt.legend(frameon=False)
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    output_nu = ColumnDataSource(pd.DataFrame.from_records({'x': df1.values,
                                                            'y': df2.values}))
    p.circle('x', 'y', size=1, source=output_nu)
    p.xaxis.axis_label = "output_nu, ref 1"
    p.yaxis.axis_label = "output_nu, ref 2"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.add_tools(hover)

    # Step lines are hacky way to make histograms with Bokeh
    arr_hist1, edges1 = np.histogram(df1.values,
                                     bins=100,
                                     range=[nu_min, nu_max])
    # The second histogram must be built from df2; using df1 again would
    # draw two identical curves and hide any difference.
    arr_hist2, edges2 = np.histogram(df2.values,
                                     bins=100,
                                     range=[nu_min, nu_max])

    # NOTE(review): these 100 x positions are not the 101 bin edges returned
    # by np.histogram, so the steps are only approximately aligned with the
    # bins.
    hist_x = np.linspace(nu_min, nu_max, 100)
    hist1 = ColumnDataSource(pd.DataFrame.from_records({'x': hist_x,
                                                        'y': arr_hist1}))
    hist2 = ColumnDataSource(pd.DataFrame.from_records({'x': hist_x,
                                                        'y': arr_hist2}))

    q = figure()
    q.step('x', 'y', source=hist1, legend_label='ref 1')
    q.step('x', 'y', source=hist2, legend_label='ref 2', color='#ff7f0e')
    q.xaxis.axis_label = "output_nu"
    q.xaxis.formatter.precision = 1
    q.legend.click_policy = "hide"
    # Hover is not working for step line glyph in Bokeh 1.4.0
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
def compare_spectrum(ref1_nu, ref1_L, ref2_nu, ref2_L, mpl_backend=False):
    """Plot two spectra and their luminosity ratio.

    The left panel overlays both spectra (L versus nu), the right panel
    shows ``L ref 1 / L ref 2`` against the frequencies of reference 1.

    Parameters
    ----------
    ref1_nu, ref1_L : pandas.Series
        Frequencies and luminosities of reference 1.
    ref2_nu, ref2_L : pandas.Series
        Frequencies and luminosities of reference 2.
    mpl_backend : bool, optional
        If True, draw with matplotlib instead of bokeh.
    """
    if mpl_backend:
        plt.figure(figsize=(14, 6))
        plt.subplot(121)
        plt.plot(ref1_nu, ref1_L, label="ref 1")
        plt.plot(ref2_nu, ref2_L, label="ref 2")
        plt.xlabel("nu")
        plt.ylabel("L")
        plt.legend(frameon=False)
        plt.subplot(122)
        plt.plot(ref1_nu, ref1_L / ref2_L)
        plt.xlabel("nu")
        plt.ylabel("L ref 1 / L ref 2")
        return

    TOOLTIPS = [("(x,y)", "(@x, @y)")]
    hover = HoverTool(tooltips=TOOLTIPS)

    p = figure()
    spectrum1 = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L}))
    spectrum2 = ColumnDataSource(pd.DataFrame.from_records({'x': ref2_nu.values,
                                                            'y': ref2_L}))
    p.line('x', 'y', source=spectrum1, legend_label='ref 1')
    p.line('x', 'y', source=spectrum2, legend_label='ref 2', color='#ff7f0e')
    # x holds the frequencies and y the luminosities, matching the
    # matplotlib branch above.
    p.xaxis.axis_label = "nu"
    p.yaxis.axis_label = "L"
    p.xaxis.formatter.precision = 1
    p.yaxis.formatter.precision = 1
    p.legend.click_policy = "hide"
    p.add_tools(hover)

    q = figure()
    # Plain arrays are divided element by element, without index alignment.
    lum_ratio = ColumnDataSource(pd.DataFrame.from_records({'x': ref1_nu.values,
                                                            'y': ref1_L.values/ref2_L.values}))
    q.circle('x', 'y', size=1, source=lum_ratio)
    q.xaxis.axis_label = "nu"
    q.yaxis.axis_label = "L ref 1 / L ref 2"
    q.xaxis.formatter.precision = 1
    q.yaxis.formatter.precision = 1
    q.add_tools(hover)

    plot = gridplot([p, q], ncols=2, plot_width=420, plot_height=360)
    show(plot)
Get the data and find all the entries for which differences exist:
# Open both reference files read-only. The stores stay open for the plotting
# cells below; no matching close() call is visible in this section.
tmp1 = pd.HDFStore(comparer.ref1_fname, "r")
tmp2 = pd.HDFStore(comparer.ref2_fname, "r")
# Keys that exist in both files but whose contents do not match.
diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2b40df7d3038> in <module> ----> 1 tmp1 = pd.HDFStore(comparer.ref1_fname, "r") 2 tmp2 = pd.HDFStore(comparer.ref2_fname, "r") 3 4 diff_entries = tt.loc[(tt["match"] == False) & (tt["exists_1"] == True) & (tt["exists_2"] == True)].index NameError: name 'comparer' is not defined
# Compare the '/test_simulation/output_nu' entry of both reference files.
compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-54d5fb0ce665> in <module> ----> 1 compare_output_nu(tmp1['/test_simulation/output_nu'], tmp2['/test_simulation/output_nu']) NameError: name 'tmp1' is not defined
# Compare the spectra of both reference files. The last '_frequency' value is
# dropped ([:-1]) -- presumably because the frequencies are bin edges, one
# more than the luminosities; verify against the stored data.
compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1],
tmp1['/test_runner_simple/spectrum/luminosity'],
tmp2['/test_runner_simple/spectrum/_frequency'][:-1],
tmp2['/test_runner_simple/spectrum/luminosity'])
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-0257c581a562> in <module> ----> 1 compare_spectrum(tmp1['/test_runner_simple/spectrum/_frequency'][:-1], 2 tmp1['/test_runner_simple/spectrum/luminosity'], 3 tmp2['/test_runner_simple/spectrum/_frequency'][:-1], 4 tmp2['/test_runner_simple/spectrum/luminosity']) NameError: name 'tmp1' is not defined
# Remove the temporary directory that the comparer created.
comparer.teardown()
--------------------------------------------------------------------------- NameError Traceback (most recent call last) <ipython-input-1-2244ed23302f> in <module> ----> 1 comparer.teardown() NameError: name 'comparer' is not defined