# Plots from the “Chain of Trust” paper

from ast import literal_eval
from pathlib import Path

import matplotlib
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import lines
from notebooks.fixed_sankey_plot import sankey
from sklearn import metrics

from sec_certs.dataset import CCDataset

# LaTeX plotting: switch matplotlib to the PGF backend and apply the seaborn theme.
matplotlib.use("pgf")
sns.set_palette("Set2")
sns.set_context("paper")

# Explicit rcParams overrides. They are applied after the seaborn calls above so
# that the values listed here are the ones in effect for every figure below.
plt.rcParams.update(
    {
        # Text rendering through LaTeX / PGF
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "text.usetex": True,
        "pgf.rcfonts": False,
        # Axes frame and axis-label size
        "axes.linewidth": 0.5,
        "axes.labelsize": 14,
        # x ticks
        "xtick.labelsize": 12,
        "xtick.bottom": True,
        "xtick.major.size": 5,
        "xtick.major.width": 0.5,
        "xtick.major.pad": 0.1,
        # y ticks
        "ytick.labelsize": 12,
        "ytick.left": True,
        "ytick.major.size": 5,
        "ytick.major.width": 0.5,
        "ytick.major.pad": 0.1,
        # Legend, markers and saved-figure padding
        "legend.title_fontsize": 10,
        "legend.fontsize": 10,
        "legend.handletextpad": 0.3,
        "lines.markersize": 0.5,
        "savefig.pad_inches": 0.01,
    }
)

# Directory holding the pre-computed CSV files that the plots below read.
INPUT_DIR = Path("./paper_artifacts/chain_of_trust/data/plots/")
# Directory that every ``savefig`` call below writes into.
OUTPUT_DIR = Path("./results/figures/")
# NOTE(review): not referenced in the visible code (the dataset is fetched with
# ``CCDataset.from_web_latest()``) — confirm whether it is still needed.
DATASET_PATH = Path("./dataset/cc_november_23/dataset.json")
INPUT_DIR.mkdir(exist_ok=True, parents=True)
# ``savefig`` does not create missing parent directories, so make sure the
# output location exists before the first figure is written.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

# Ecosystem insights plots

# Figure size (inches) reused by the plots of this section.
figure_width = 3.5
figure_height = 2.5

# Load the latest Common Criteria dataset; ``df`` is used further below for the
# EAL category ordering.
dset = CCDataset.from_web_latest()
df = dset.to_pandas()

# Validity boxplot: one box per ``year_from`` showing ``validity_period``.
df_validity = pd.read_csv(INPUT_DIR / "df_validity.csv")
box = sns.boxplot(data=df_validity, x="year_from", y="validity_period", linewidth=0.75, flierprops={"marker": "x"})
box.set(xlabel="", ylabel="")
# Label every third box and rotate the labels by 45 degrees.
# NOTE(review): assumes the first box is 1997 and the years are contiguous —
# confirm against df_validity.csv.
plt.xticks([3 * i for i in range(10)], [3 * i + 1997 for i in range(10)], rotation=45)

fig = plt.gcf()
fig.set_size_inches(figure_width, figure_height)
fig.tight_layout(pad=0.1)
fig.savefig(OUTPUT_DIR / "boxplot_validity.pdf", bbox_inches="tight", dpi=300)
# Close the figure once it is saved: the next seaborn call plots onto the
# current axes, which would otherwise still be this boxplot when the code is
# run as a plain script.
plt.close(fig)
# Average EAL level over time, smartcards versus all remaining categories.

avg_levels = pd.read_csv(INPUT_DIR / "avg_eal.csv")
avg_levels = avg_levels.loc[avg_levels.year_from < 2025]
# Position of every EAL label inside the ordered categorical of the dataset frame.
eal_to_num_mapping = {name: position for position, name in enumerate(df["eal"].cat.categories)}
# Keep "ICs, Smartcards" as is and collapse every other category into one bucket.
avg_levels["smartcard_category"] = avg_levels.category.where(
    avg_levels.category == "ICs, Smartcards", "Other categories"
)
line = sns.lineplot(
    x="year_from",
    y="eal_number",
    data=avg_levels,
    hue="smartcard_category",
    style="smartcard_category",
    markers=True,
    errorbar=None,
    linewidth=2,
)
line.set(xlabel=None, ylabel=None, title=None, xlim=(1999.6, 2023))
ymin = 1
ymax = 9
# Labels without a "+" get an invisible one appended so that all tick labels align.
ylabels = [
    label + ("" if "+" in label else r"\phantom{+}") for label in list(eal_to_num_mapping)[ymin : ymax + 1]
]
plt.yticks(range(ymin, ymax + 1), ylabels)
# Ticks every three years from 1997, plus the final year 2023, rotated by 45 degrees.
plt.xticks([*range(1997, 2022, 3), 2023], rotation=45)
# NOTE(review): the second call builds a new legend for the same axes —
# confirm that the first call is still needed.
line.legend(title=None, labels=avg_levels.smartcard_category.unique())
plt.legend(frameon=False)

fig = plt.gcf()
fig.set_size_inches(figure_width, figure_height)
fig.tight_layout(pad=0.1)
fig.savefig(OUTPUT_DIR / "temporal_trends_categories.pdf", dpi=300)
plt.show()
plt.close()

# Interesting schemes

interesting_schemes = pd.read_csv(INPUT_DIR / "interesting_schemes.csv")

# One line per scheme; the same column drives hue and style, so both the colour
# and the dash/marker pattern identify the scheme.
line = sns.lineplot(
    x="year_from",
    y="size",
    data=interesting_schemes,
    hue="scheme",
    style="scheme",
    dashes=True,
    markers=True,
    linewidth=2,
)
line.set(xlabel=None, ylabel=None, title=None, xlim=(1999.6, 2023), ylim=(0, 100))
# Ticks every three years from 1997, plus the final year 2023, rotated by 45 degrees.
plt.xticks([*range(1997, 2022, 3), 2023], rotation=45)
line.legend(title=None)

fig = plt.gcf()
fig.set_size_inches(figure_width, figure_height)
fig.tight_layout(pad=0.1)
fig.savefig(OUTPUT_DIR / "temporal_trends_schemes.pdf", dpi=300)
plt.show()
plt.close()

# Average number of transitive references over time

# Line plot of ``n_references`` over ``date``, one line per product category.
df_to_plot = pd.read_csv(INPUT_DIR / "avg_refs_over_time.csv", parse_dates=["date"])
# Shorten the catch-all category name used in the legend.
df_to_plot["category"] = df_to_plot["category"].map(
    lambda name: "others" if name == "others categories" else name
)

plt.figure()
g = sns.lineplot(data=df_to_plot, x="date", y="n_references", hue="category", errorbar=None)
plt.legend(frameon=True, handlelength=2, title="Product category")
g.set_xlabel("")
# Raw string: "\#" is not a valid Python escape sequence, but the backslash
# has to reach LaTeX so that "#" is typeset literally.
g.set_ylabel(r"Avg. \# transitive refs.")

# Show only the year on the x axis, with one tick every five years (1998-2023).
dtFmt = mdates.DateFormatter("%Y")
g.xaxis.set_major_formatter(dtFmt)
g.set_xticks([pd.to_datetime(f"{year}-01-01") for year in range(1998, 2024, 5)])
g.figure.set_size_inches(3.9, 3)
plt.tight_layout(pad=0.1)
g.figure.savefig(OUTPUT_DIR / "lineplot_avg_refs.pdf")
g.figure.show()

# Average reach over time

# Line plot of the ``n_references`` column of avg_reach_over_time.csv over
# ``date``, one line per product category.
df_to_plot = pd.read_csv(INPUT_DIR / "avg_reach_over_time.csv", parse_dates=["date"])

plt.figure()
g = sns.lineplot(x="date", y="n_references", hue="category", data=df_to_plot, errorbar=None)
plt.legend(title="Product category", frameon=True, handlelength=2)
g.set_xlabel("")
g.set_ylabel("Average certificate reach")

# Show only the year on the x axis, with one tick every five years (1998-2023).
dtFmt = mdates.DateFormatter("%Y")
g.xaxis.set_major_formatter(dtFmt)
g.set_xticks([pd.to_datetime(f"{year}-01-01") for year in range(1998, 2024, 5)])

g.figure.set_size_inches(3.9, 3)
plt.tight_layout(pad=0.1)
g.figure.savefig(OUTPUT_DIR / "lineplot_avg_reach.pdf")
g.figure.show()

# Area under curve

# ROC curves (with AUC in the legend) for the three prediction files.
plt.figure(figsize=(2.8, 1.8))
sns.set_palette("Set2")
colors = plt.cm.Dark2(np.linspace(0, 1, 8))

df_sent = pd.read_csv(INPUT_DIR / "df_pred_sentence_transformers.csv")
df_tf_idf = pd.read_csv(INPUT_DIR / "df_pred_tf_idf.csv")
df_baseline = pd.read_csv(INPUT_DIR / "df_pred_baseline.csv")

# (predictions, legend name) in drawing order; curve i is drawn with colors[i].
roc_inputs = [
    (df_sent, "Sent. trans."),
    (df_tf_idf, "TF-IDF"),
    (df_baseline, "Random guess"),
]

# The smaller legend font only applies while the legend is created in this context.
with plt.rc_context({"legend.fontsize": 8}):
    for curve_idx, (predictions, curve_name) in enumerate(roc_inputs):
        fpr, tpr, thresholds = metrics.roc_curve(predictions.y_true, predictions.y_pred)
        auc = metrics.roc_auc_score(predictions.y_true, predictions.y_pred)
        plt.plot(fpr, tpr, label=f"{curve_name} (AUC={auc:.2f})", color=colors[curve_idx])

    plt.legend(loc="lower right")
    plt.savefig(OUTPUT_DIR / "roc_auc.pdf")
    plt.show()

# Stack-bar plot of annotations in categories

# Horizontal stacked bar chart built from the columns of ref_categories_stackplot.csv.
df = pd.read_csv(INPUT_DIR / "ref_categories_stackplot.csv")

ax = df.plot.barh(stacked=True, rot=0, width=0.95)
ax.set_ylim(-0.6, 2.6)
# Raw string: "\#" is not a valid Python escape sequence, but the backslash
# has to reach LaTeX so that "#" is typeset literally.
ax.set_xlabel(r"\# references", fontsize=12)
ax.set_yticklabels(["Others", "Smartcard-related", "Smartcards"])
ax.legend(title="Reference context", loc="lower right", frameon=True)

# Row 2 (top bar): the counts are written in white, in axes coordinates.
plt.text(0.4, 0.8, df.iloc[2]["Component reuse"], transform=ax.transAxes, color="white", fontsize=14)
plt.text(0.81, 0.8, df.iloc[2]["Predecessor"], transform=ax.transAxes, color="white", fontsize=14)

# Row 1 (middle bar): short horizontal lines plus the two counts in black.
plt.axhline(y=1.21, xmin=0.05, xmax=0.18, color="black", linewidth=0.75)
plt.axhline(y=0.9, xmin=0.12, xmax=0.18, color="black", linewidth=0.75)
plt.text(0.2, 0.55, df.iloc[1]["Component reuse"], transform=ax.transAxes, color="black", fontsize=14)
plt.text(0.2, 0.45, df.iloc[1]["Predecessor"], transform=ax.transAxes, color="black", fontsize=14)

# Row 0 (bottom bar): same annotation scheme as the middle bar.
plt.axhline(y=0.17, xmin=0.02, xmax=0.1, color="black", linewidth=0.75)
plt.axhline(y=-0.1, xmin=0.05, xmax=0.1, color="black", linewidth=0.75)
plt.text(0.12, 0.22, df.iloc[0]["Component reuse"], transform=ax.transAxes, color="black", fontsize=14)
plt.text(0.12, 0.13, df.iloc[0]["Predecessor"], transform=ax.transAxes, color="black", fontsize=14)

ax.figure.set_size_inches(4, 3)
plt.tight_layout(pad=0.1)
plt.savefig(OUTPUT_DIR / "stacked_barplot.pdf")

# Archived certificate half-life

# Complementary ECDF of the ``n_days`` column, with a dashed marker at 365 days.
plt.figure()

df = pd.read_csv(INPUT_DIR / "archived_half_life.csv")

with plt.rc_context({"legend.title_fontsize": 10, "legend.fontsize": 10}):
    g = sns.ecdfplot(data=df.n_days, complementary=True)

    # Dashed red line at one year, and an empty proxy line that represents it in the legend.
    plt.axvline(x=365, linewidth=0.75, linestyle="--", color="r")
    vertical_line = lines.Line2D(
        [], [], label="One year", color="r", linestyle="--", marker="", markersize=10, markeredgewidth=1.5
    )
    plt.legend(handles=[vertical_line])

    g.figure.set_size_inches(3, 2)
    g.set_xlim(0, 2000)

    g.set_xlabel("Number of days")
    g.set_ylabel("Proportion")

    # Show the proportion axis as percentages with ticks at every quarter.
    g.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(xmax=1))
    g.set_yticks([0, 0.25, 0.5, 0.75, 1])

    plt.tight_layout(pad=0.05)
    g.figure.savefig(OUTPUT_DIR / "cdf_half_life.pdf")
    g.figure.show()

# Age of referenced certificate in composite-evaluation products

# ECDF of ``date_diff`` per scheme (FR, DE, NL only), with a dashed marker at 540 days.
plt.figure()

df = pd.read_csv(INPUT_DIR / "ecdf_archival_data.csv")
df = df.loc[df.scheme.isin({"FR", "DE", "NL"})]

# Fix the hue order and the colour of every scheme up front. Reading colours
# back from ``g.lines`` by position relies on the order in which seaborn draws
# the hue levels, which is not guaranteed to match ``unique()`` and can attach
# the wrong colour to a legend label.
unique_hues = df["scheme"].unique()
scheme_palette = dict(zip(unique_hues, sns.color_palette(n_colors=len(unique_hues))))

with plt.rc_context({"legend.fontsize": 10, "legend.title_fontsize": 10}):
    # ``legend=False``: the legend is built by hand below so that it can also
    # contain the entry for the vertical line.
    g = sns.ecdfplot(
        data=df,
        x="date_diff",
        hue="scheme",
        hue_order=list(unique_hues),
        palette=scheme_palette,
        legend=False,
    )
    plt.axvline(x=540, color="r", linestyle="--", linewidth=0.75)

    # Empty proxy lines: one per scheme in its plot colour, plus the 540-day marker.
    vertical_line = lines.Line2D([], [], color="r", linestyle="--", markersize=10, label="18 months")
    handles = [lines.Line2D([], [], color=scheme_palette[label], label=label) for label in unique_hues]

    handles.append(vertical_line)
    labels = list(unique_hues) + ["18 months"]

    g.legend(handles=handles, labels=labels)

    g.figure.set_size_inches(3, 2)
    # Show the proportion axis as percentages with ticks at every quarter.
    g.yaxis.set_major_formatter(matplotlib.ticker.PercentFormatter(xmax=1))
    g.set_yticks([0, 0.25, 0.5, 0.75, 1])
    g.set_xlim(0, 2000)
    g.set_xlabel("Number of days")
    g.set_ylabel("Proportion")
    plt.tight_layout(pad=0.05)
    g.figure.savefig(OUTPUT_DIR / "ref_comp_age.pdf")
    plt.show()

# Sankey diagram

# Sankey diagram of references between schemes: left = scheme of the referencing
# certificate, right = scheme of the referenced certificate.

# ``refs`` is stored as the text of a Python literal; parse it back with
# ``literal_eval``. ``na_action="ignore"`` leaves empty cells as NaN instead of
# passing them to ``literal_eval`` (which raises on non-string input); those
# rows are dropped by the ``notnull`` filter below.
df = pd.read_csv(INPUT_DIR / "sankey_scheme_data.csv").assign(
    refs=lambda df_: df_.refs.map(literal_eval, na_action="ignore")
)

cert_id_to_scheme_mapping = dict(zip(df.cert_id, df.scheme))
# One row per (certificate scheme, referenced certificate); references whose
# target has no known scheme in the mapping are dropped.
exploded = (
    df.copy()
    .loc[lambda df_: df_.refs.notnull(), ["scheme", "refs"]]
    .explode("refs")
    .assign(ref_scheme=lambda df_: df_.refs.map(cert_id_to_scheme_mapping))
    .loc[lambda df_: df_.ref_scheme.notnull()]
)

all_schemes = set(exploded.scheme.unique()) | set(exploded.ref_scheme.unique())
colors = list(sns.color_palette("hls", len(all_schemes), as_cmap=False).as_hex())
# Pair colours with the schemes in sorted order: iterating the set directly
# depends on string hashing, so each run could colour the schemes differently.
color_dict = dict(zip(sorted(all_schemes, key=str), colors))

# ``plt.subplots`` already creates the figure; no separate ``plt.figure()`` is
# needed (it would only leave an extra empty figure behind).
figure, axes = plt.subplots(1, 1)
figure.set_size_inches(4, 4)
figure.set_tight_layout(True)

sankey(
    exploded.scheme,
    exploded.ref_scheme,
    colorDict=color_dict,
    leftLabels=list(exploded.scheme.unique()),
    rightLabels=list(exploded.ref_scheme.unique()),
    fontsize=7,
    ax=axes,
)

figure.savefig(str(OUTPUT_DIR / "scheme_references.pdf"), bbox_inches="tight")
plt.show()