Vulnerability analysis#

This notebook contains analysis of vulnerabilities in Common Criteria certificates.

import itertools

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

from sec_certs.cert_rules import cc_rules

sns.set_style("whitegrid")
sns.set_palette("deep")
sns.set_context("notebook")  # Set to "paper" for use in paper :)

import itertools
import warnings
from pathlib import Path
import tempfile
from sec_certs.dataset import CCDataset, CCDatasetMaintenanceUpdates, CVEDataset, CPEDataset
from sec_certs.utils.pandas import (
    compute_cve_correlations,
    compute_maintenances_that_come_after_vulns,
    discover_sar_families,
    expand_df_with_cve_cols,
    filter_to_cves_within_validity_period,
    get_sar_level_from_set,
    get_top_n_cwes,
    move_fixing_mu_to_directory,
    prepare_cwe_df,
)

warnings.simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

RESULTS_DIR = Path("./results")
RESULTS_DIR.mkdir(exist_ok=True)

Retrieve necessary objects from sec-certs#

# Local instantiation

print("Loading CCDataset")
dset: CCDataset = CCDataset.from_json("/path/to/cc_dset.json")

print("Loading CCDatasetMaintenanceUpdates")
main_dset: CCDatasetMaintenanceUpdates = CCDatasetMaintenanceUpdates.from_json("/path/to/maintenance_updates.json")

print("Loading CVEDataset")
cve_dset: CVEDataset = CVEDataset.from_json("/path/to/cve_dataset.json")

print("Loading CPEDataset")
cpe_dset: CPEDataset = CPEDataset.from_json("/path/to/cpe_dataset.json")

# # Remote instantiation (takes approx. 10 minutes to complete)
# dset: CCDataset = CCDataset.from_web_latest(path="dset", auxiliary_datasets=True)

# print("Downloading dataset of maintenance updates")
# main_dset: CCDatasetMaintenanceUpdates = CCDatasetMaintenanceUpdates.from_web_latest()

# print("Downloading CPE dataset")
# cpe_dset: CPEDataset = dset.auxiliary_datasets.cpe_dset

# print("Downloading CVE dataset")
# cve_dset: CVEDataset = dset.auxiliary_datasets.cve_dset

Dataset preprocessing#

  • Loads CCDataset into pandas DataFrame

  • Expands it with various maintenance-related columns

  • Expands it with various CVE-related columns

df = dset.to_pandas()

if "n_maintenances" not in df.columns:
    n_maintenances = main_dset.get_n_maintenances_df()
    df = pd.concat([df, n_maintenances], axis="columns")

if "maintenance_dates" not in df.columns:
    main_dates = main_dset.get_maintenance_dates_df()
    df = pd.concat([df, main_dates], axis="columns")

# Expand DataFrame with CVEs that affect some certificate
cves = list(itertools.chain.from_iterable(x.heuristics.related_cves for x in dset if x.heuristics.related_cves))
cve_dict = {x: cve_dset[x] for x in cves}
cve_dset.cves = cve_dict  # Limit cve_dset to CVEs relevant to some certificate

df = expand_df_with_cve_cols(df, cve_dset)

df = df.loc[(df.not_valid_before.notnull()) & (df.not_valid_after.notnull())]

df_cves_within_validity_period = filter_to_cves_within_validity_period(
    df.copy(), cve_dset
)
df_cves_within_validity_period = expand_df_with_cve_cols(df_cves_within_validity_period, cve_dset)

df_cpe_rich = df.loc[~df.cpe_matches.isnull()].copy()
df_cve_rich = df.loc[df.n_cves > 0].copy()
# Take a look at columns related to CVEs
df.loc[
    ~df.related_cves.isna(), ["related_cves", "cve_published_dates", "earliest_cve", "worst_cve_score", "avg_cve_score"]
]

Plots#

  • distribution of categories in CPE-rich vs. all certificates

  • distribution of years in CPE-rich, CVE-rich, and all certificates

  • distribution of security levels in CPE-rich, CVE-rich and all certificates

  • distribution of most common (top 20) vendors in CPE-rich, CVE-rich and all certificates

# distribution of categories in CPE-rich vs. all certificates
categories_cpe = df_cpe_rich.category.value_counts().sort_index().rename("Category distribution CPE-rich")
categories_cve = df_cve_rich.category.value_counts().sort_index().rename("Category distribution CVE-rich")
categories_all = df.category.value_counts().sort_index().rename("Category distribution all")

categories_merged = pd.concat([categories_all, categories_cpe, categories_cve], axis=1)
categories_merged = categories_merged.div(categories_merged.sum(axis=0), axis=1)

categories_merged.plot.bar(
    title="Categories (without smartcards) comparison between CPE-rich, CVE-rich and all certificates"
)
plt.show()
plt.savefig(RESULTS_DIR / "categories.pdf", bbox_inches="tight")

# distribution of years in CPE-rich, CVE-rich, and all certificates
years_cpe = df_cpe_rich.year_from.value_counts().sort_index().rename("Year distribution CPE-rich")
years_cve = df_cve_rich.year_from.value_counts().sort_index().rename("Year distribution CVE-rich")
years_all = df.year_from.value_counts().sort_index().rename("Year distribution all certificates")

years_merged = pd.concat([years_all, years_cpe, years_cve], axis=1)
years_merged.index.name = "year_from"
years_merged = years_merged.loc[years_merged.index < 2022]
years_merged = years_merged.div(years_merged.sum(axis=0), axis=1)
years_merged.plot.line(title="Years comparision between CPE-rich, CVE-rich and all certificates")

plt.show()
plt.savefig(RESULTS_DIR / "cve_cpe_certs_time_evolution.pdf", bbox_inches="tight")

# distribution of security levels in CPE-rich, CVE-rich and all certificates
levels_cpe = df_cpe_rich.eal.value_counts().sort_index().rename("EAL distribution CPE-rich")
levels_cve = df_cve_rich.eal.value_counts().sort_index().rename("EAL distribution CVE-rich")
levels_all = df.eal.value_counts().sort_index().rename("EAL distribution all certificates")

levels_merged = pd.concat([levels_all, levels_cpe, levels_cve], axis=1)
levels_merged = levels_merged.div(levels_merged.sum(axis=0), axis=1)
levels_merged.plot.bar(title="EAL comparision between CPE-rich, CVE-rich and all certificates")
plt.show()
plt.savefig(RESULTS_DIR / "eal_distr_all_cpes_cves.pdf", bbox_inches="tight")

# distribution of most common (top 20) vendors in CPE-rich, CVE-rich and all certificates
top_manufacturers = list(df.manufacturer.value_counts().head(20).index)
vendors_cpe = (
    df_cpe_rich.loc[df_cpe_rich.manufacturer.isin(top_manufacturers), "manufacturer"]
    .value_counts()
    .sort_index()
    .rename("Common vendors distribution CPE-rich")
)
vendors_cve = (
    df_cve_rich.loc[df_cve_rich.manufacturer.isin(top_manufacturers), "manufacturer"]
    .value_counts()
    .sort_index()
    .rename("Common vendors distribution CVE-rich")
)
vendors_all = (
    df.loc[df.manufacturer.isin(top_manufacturers), "manufacturer"]
    .value_counts()
    .sort_index()
    .rename("Common vendors distribution all certificates")
)

vendors_merged = pd.concat([vendors_all, vendors_cpe, vendors_cve], axis=1)
vendors_merged = vendors_merged.div(vendors_merged.sum(axis=0), axis=1)
vendors_merged.plot.bar(title="Common vendors comparison between CPE-rich, CVE-rich and all certificates")
plt.show()
plt.savefig(RESULTS_DIR / "top_vendors_vulns.pdf", bbox_inches="tight")

Correlations between (EAL, SAR) and CVEs#

# This limits analysis to SAR families that are popular-enough (>100 CVE-rich certs, second-most-popular value with >= 40 instances)
cve_rich = df_cves_within_validity_period.loc[df_cves_within_validity_period.related_cves.notnull()].copy()
families = discover_sar_families(cve_rich.extracted_sars)

for family in tqdm(families):
    cve_rich[family] = cve_rich.extracted_sars.map(lambda x: get_sar_level_from_set(x, family))

sars = cve_rich[families].notnull().sum().to_frame("support")
sars.index.name = "family"
sars["second_most_popular"] = sars.index.map(
    lambda x: 0 if not len(cve_rich[x].value_counts()) > 1 else cve_rich[x].value_counts().iloc[1]
)
sars = sars.loc[(sars.support >= 100) & (sars.second_most_popular >= 40)].sort_values(by="support", ascending=False)
sars["range"] = sars.index.map(lambda x: len(cve_rich[x].value_counts()))

df_corr = compute_cve_correlations(
    df_cves_within_validity_period.loc[
        df_cves_within_validity_period.category != "ICs, Smart Cards and Smart Card-Related Devices and Systems"
    ],
    sar_families=sars.index.tolist(),
    filter_nans=False,
)
df_corr["range"] = sars["range"]
df_corr.loc["eal", "range"] = len(cve_rich.eal.cat.categories)
df_corr = (
    df_corr.sort_values(by="avg_cve_score_corr")
    .rename(index={"eal": "EAL"})
    .drop(columns=["worst_cve_score_corr", "worst_cve_pvalue"])
)
df_corr = pd.concat([df_corr.loc["EAL", :].to_frame().transpose(), df_corr.drop("EAL")])
df_corr.range = df_corr.range.astype("int")
df_corr.support = df_corr.support.astype("int")
df_corr.index.name = "EAL/SAR family"
df_corr.index = df_corr.index.map(lambda x: x.replace("_", "\_"))
df_corr.to_csv(RESULTS_DIR / "certs_without_smartcards_sar_cve_corr.csv", float_format="%.2e")
df_cves_within_validity_period

Time from CVE to maintenance update#

The following block computes time from CVE to maintenance update of a certificate. Though, it has some limitations:

  • So far we only watch for first CVE related to a certified product (though most of certificates have only one maintenance anyway, if they even have one)

  • We assume that a maintance update released after (by date comparison) CVE may be related to that vulnerability

  • The sample size is small. Only ~40 certificates have at least one CVE and at least one Maintenance update. Out of those, only ~30 certificates have maintenance update that comes after a CVE

df_main_after_vuln = compute_maintenances_that_come_after_vulns(df_cves_within_validity_period)

n_vuln_certs_wo_maintenances = df_cves_within_validity_period.loc[
    (df_cves_within_validity_period.n_cves > 0) & (df_cves_within_validity_period.n_maintenances == 0)
].shape[0]
n_vuln_certs_w_maintenances = df_cves_within_validity_period.loc[
    (df_cves_within_validity_period.n_cves > 0) & (df_cves_within_validity_period.n_maintenances > 0)
].shape[0]
n_certs_with_main_after_vuln = df_main_after_vuln.loc[df_main_after_vuln.earliest_maintenance_after_vuln.notna()].shape[
    0
]

print(
    f"Number of certificates with >0 CVEs in validity period but 0 maintenance reports: {n_vuln_certs_wo_maintenances}"
)
print(
    f"Number of certificates with >0 CVEs in validity period and >0 maintenance reports: {n_vuln_certs_w_maintenances}"
)
print(f"Number of vulnerable certificates with a maintenance report *after* CVE: {n_certs_with_main_after_vuln}")

updates_that_should_fix_vulns_path = RESULTS_DIR / "updates_that_should_fix_vulns"
updates_that_should_fix_vulns_path.mkdir(exist_ok=True)
mu_filenames = move_fixing_mu_to_directory(
    df_main_after_vuln,
    main_dset.to_pandas(),
    updates_that_should_fix_vulns_path,
    "/Users/adam/phd/projects/certificates/sec-certs/datasets/cc_final_run_may_23/auxiliary_datasets/maintenances/reports/pdf",
)
df_main_after_vuln.loc[
    df_main_after_vuln.earliest_maintenance_after_vuln.notna(),
    [
        "name",
        "cpe_matches",
        "related_cves",
        "not_valid_before",
        "not_valid_after",
        "cve_published_dates",
        "maintenance_dates",
        "earliest_maintenance_after_vuln",
    ],
].to_csv(RESULTS_DIR / "vulns_before_mu.csv")
df_main_after_vuln.loc[
    df_main_after_vuln.earliest_maintenance_after_vuln.notna(),
    ["maintenance_dates", "earliest_cve", "earliest_maintenance_after_vuln"],
]

Histogram timeline of new vulnerabilities#

Shows when vulnerabilities are announced in relation to the date of certification & date of certificate expiration.

Note: Some certificates (especially the new ones) don’t have their expiration date set yet. These are discarded from the analysis.

exploded_cves = (
    df_cve_rich.explode("related_cves")
    .loc[:, ["related_cves", "not_valid_before", "not_valid_after"]]
    .rename(columns={"related_cves": "cve"})
)
exploded_cves = exploded_cves.dropna()
exploded_cves["cve_published_date"] = exploded_cves.cve.map(lambda x: cve_dset[x].published_date)

exploded_cves.not_valid_before = exploded_cves.not_valid_before.dt.normalize()
exploded_cves.not_valid_after = exploded_cves.not_valid_after.dt.normalize()
exploded_cves.cve_published_date = exploded_cves.cve_published_date.dt.tz_localize(None).dt.normalize()

exploded_cves["n_days_after_certification"] = (
    exploded_cves.cve_published_date - exploded_cves.not_valid_before
).dt.days
exploded_cves["n_days_after_expiry"] = (exploded_cves.cve_published_date - exploded_cves.not_valid_after).dt.days

plt.rcParams["figure.figsize"] = [12, 4]
plt.rcParams["figure.autolayout"] = True
figure, axes = plt.subplots(1, 2)

exploded_cves.to_csv(RESULTS_DIR / "exploded_cves.csv")

hist = sns.histplot(exploded_cves.n_days_after_certification, kde=True, ax=axes[0])
hist.set(
    xlim=(-4000, 4600),
    title="CVEs appearing n days after certification",
    xlabel="Number of days after date of certification",
    ylabel="Frequency of CVEs",
)
hist.axvline(0, color="red", linewidth="1", label="Day of certification")
hist.legend(loc="upper right")
# plt.savefig(RESULTS_DIR / "cves_n_days_after_certification.pdf", bbox_inches='tight')
# plt.show()

hist = sns.histplot(exploded_cves.n_days_after_expiry, kde=True, ax=axes[1])
hist.set(
    xlim=(-6200, 4000),
    title="CVEs appearing n days after certificate expiry date",
    xlabel="Number of days after certificate expiration",
    ylabel="Frequency of CVEs",
)
hist.axvline(0, color="red", linewidth="1", label="Day of expiration")
hist.legend(loc="upper left")
plt.savefig(RESULTS_DIR / "cves_vs_certificate_lifetime.pdf", bbox_inches="tight")
plt.show()

n_cves = exploded_cves.shape[0]
ratio_before_cert = exploded_cves.loc[exploded_cves.n_days_after_certification <= 0].shape[0] / n_cves
ratio_after_cert = exploded_cves.loc[exploded_cves.n_days_after_certification > 0].shape[0] / n_cves
print(f"Ratio of CVEs appearing before (or exactly on) certification date: {100 * ratio_before_cert:.2f}%")
print(f"Ratio of CVEs appearing after certification date: {100 * ratio_after_cert:.2f}%")

How many CVE-rich certificates were revoked within <365 days after certification#

maybe_revoked = df_cve_rich.loc[
    ~df_cve_rich.not_valid_after.isna(),
    ["not_valid_before", "not_valid_after", "n_cves", "worst_cve_score", "avg_cve_score", "related_cves"],
]
maybe_revoked["validity_n_days"] = (maybe_revoked.not_valid_after - maybe_revoked.not_valid_before).dt.days
maybe_revoked = maybe_revoked.loc[maybe_revoked.validity_n_days < 365]
print(f"How many CVE-rich certificates were revoked in <365 days after certification: {maybe_revoked.shape[0]}")

df_w_validity_dates = df.loc[~df.not_valid_after.isna()].copy()
df_w_validity_dates.loc[:, "validity_n_days"] = (
    df_w_validity_dates.not_valid_after - df_w_validity_dates.not_valid_before
).dt.days
df_w_validity_dates = df_w_validity_dates.loc[df_w_validity_dates.validity_n_days < 365]
print(f"How many certificates were revoked in <365 days after certification: {df_w_validity_dates.shape[0]}")
print(
    f"How many certificates have expiry date same as certification date: {df_w_validity_dates.loc[df_w_validity_dates.not_valid_before == df_w_validity_dates.not_valid_after].shape[0]}"
)
df_w_validity_dates.loc[
    :, ["name", "not_valid_before", "not_valid_after", "validity_n_days", "related_cves"]
].sort_values(by="validity_n_days", ascending=False)

Scatterplot of CVE severity w.r.t. categories#

n_cves_top_20_certs = df_cve_rich.n_cves.sort_values(ascending=False).head(20)[-1]
cves = (
    df_cve_rich.loc[
        (df_cve_rich.year_from < 2022) & (df_cve_rich.n_cves < n_cves_top_20_certs) & (~df_cve_rich.eal.isna()),
        ["related_cves", "eal", "category"],
    ]
    .explode("related_cves")
    .rename(columns={"related_cves": "cve_id"})
)
cves["published_date"] = cves.cve_id.map(lambda x: cve_dset[x].published_date).dt.tz_localize(None).dt.normalize()
cves["base_score"] = cves.cve_id.map(lambda x: cve_dset[x].metrics.base_score)
cves = cves.drop_duplicates()

g = sns.relplot(
    data=cves,
    x="published_date",
    y="base_score",
    hue="eal",
    col="category",
    col_wrap=3,
    height=4,
    kind="scatter",
    facet_kws={"sharey": False, "sharex": False},
    palette="viridis",
)
g.set_titles("{col_name}")
g.set_xticklabels(rotation=60)
g.set_axis_labels("Vulnerability publish date", "Vulnerability base score")
g.tight_layout()
plt.savefig(RESULTS_DIR / "scatter_category_vs_cves.pdf", bbox_inches="tight")

Histogram of vulnerability scores per category#

cves = cves.reset_index()  # See https://github.com/mwaskom/seaborn/issues/2733
g = sns.displot(
    data=cves,
    x="base_score",
    col="category",
    col_wrap=3,
    height=3,
    kind="hist",
    facet_kws={"sharey": False, "sharex": True},
)
g.set_titles("{col_name}")
# g.set_xticklabels(rotation=60)
g.set_axis_labels("Vulnerability base score", "Frequency")
g.tight_layout()
plt.savefig(RESULTS_DIR / "hist_category_vs_cve.pdf", bbox_inches="tight")

Protection-profiles vs CVEs#

Computes Pearson’s correlation coefficient between number of CVEs and binary variable: Has a protection profile (1) or has not a protection profile (0) Beware, number of supporting certificates may be quite low

df_cve_rich["has_pp"] = df_cve_rich.protection_profiles.isna()
corrs = df_cve_rich.groupby("category")[["n_cves", "has_pp"]].corr().iloc[0::2, -1].droplevel(level=1)
corrs.name = "correlation"
support_counts = df_cve_rich.groupby("category").size()
support_counts.name = "n certs in category"
pp_rich_counts = df_cve_rich.loc[df_cve_rich.has_pp == True].groupby("category").size()
pp_rich_counts.name = "n certs with PP"
pp_n_cves_corr = pd.concat([corrs, support_counts, pp_rich_counts], axis=1)
pp_n_cves_corr

CWEs in vulnerable certificates#

  • Study distribution of CWEs in vulnerable certificates.

  • Experiment mainly with distribution across categories and security level.

  • Compare our results with findings from open-source projects.

  • Limit only to vulns. that were disclosed in the validity period of a certificate

cc_df, cwe_df = prepare_cwe_df(
    df_cves_within_validity_period.loc[df_cves_within_validity_period.n_cves > 0], cve_dset, fine_grained=True
)
cc_df
cwe_df
# Print top-25 CWEs across dataset
top_25_cwes = get_top_n_cwes(cc_df, cwe_df, n_cwes=25)
top_25_cwes
# Store top-10 CWEs into CSVs, do this also per category
CWE_DIR = RESULTS_DIR / "cwes"
CWE_DIR.mkdir(exist_ok=True)
top_25_cwes.to_csv(CWE_DIR / "top_25_cwe_overall.csv")

for cat in cc_df.category.unique():
    get_top_n_cwes(cc_df, cwe_df, category=cat).to_csv(CWE_DIR / (cat + ".csv"))

# Same for EALs
for eal in cc_df.eal.unique():
    get_top_n_cwes(cc_df, cwe_df, eal=eal).to_csv(CWE_DIR / (eal + ".csv"))
# For more coarse-grained description of CVEs, one could use so-called Security Fault Patterns (SFP) clusters.
from sec_certs.utils.pandas import SFPModel

model = SFPModel.from_web()
model.search_cwe(626)  # <-- This returns Primary Cluster and Secondary cluster

CWE evolution in time#

the following plot shows how top-10 overall CWEs evolve in time. As can be seen from the plot, the distribution is quite stable except for CWE-119. We assume that this can be caused by favoring different CWEs for similar problems, so it does not necessarilly correspond to certain weakness being eliminated recently in the coe, but even this may be the case.

cc_df.head()
cc_df["vuln_year"] = cc_df.cve_id.map(lambda x: int(x.split("-")[1]))
cc_df = cc_df.loc[cc_df.vuln_year < 2022]
top_10_cwes = set(get_top_n_cwes(cc_df, cwe_df, n_cwes=10).index.tolist())
top_10_cwes = cc_df.loc[cc_df.cwe_id.isin(top_10_cwes)]
pivot = (
    top_10_cwes.groupby(["vuln_year", "cwe_id"], as_index=False)
    .size()
    .rename(columns={"size": "frequency"})
    .pivot("vuln_year", "cwe_id", "frequency")
    .fillna(0)
)
sns.lineplot(data=pivot)
plt.xlabel("Year of vulnerability")
plt.ylabel("Number of vulnerabilities")
plt.legend(title="Weakness ID")
plt.title("CWE prevalence in time")