# Vulnerability analysis

from sec_certs.dataset.fips import FIPSDataset
from sec_certs.dataset.cpe import CPEDataset
from sec_certs.dataset.cve import CVEDataset
from sec_certs.utils.pandas import expand_df_with_cve_cols
import pandas as pd
import seaborn as sns
import itertools
import functools
import matplotlib.pyplot as plt
from scipy import stats

# Obtain the FIPS dataset through `FIPSDataset.from_web_latest`, asking for the
# auxiliary datasets as well so the CVE and CPE datasets are reachable below.
dset = FIPSDataset.from_web_latest(path="dset", auxiliary_datasets=True)

cve_dset: CVEDataset = dset.auxiliary_datasets.cve_dset
cpe_dset: CPEDataset = dset.auxiliary_datasets.cpe_dset

df = dset.to_pandas()

# Flatten the related CVE identifiers of every certificate that has any.
cves = [
    cve_id
    for cert in dset
    if cert.heuristics.related_cves
    for cve_id in cert.heuristics.related_cves
]

# Keying a dict by identifier collapses duplicates shared by several certificates.
cve_dict = {cve_id: cve_dset[cve_id] for cve_id in cves}

# Limit cve_dset to CVEs relevant to some certificate
cve_dset.cves = cve_dict

# NOTE(review): presumably adds the CVE-derived columns (n_cves, worst_cve_score,
# avg_cve_score) read later in this script -- confirm against sec_certs.utils.pandas.
df = expand_df_with_cve_cols(df, cve_dset)

# Independent copies of the rows whose `cpe_matches` / `related_cves` value is
# non-null; copying keeps later edits of the subsets from touching `df`.
df_cpe_rich = df[df["cpe_matches"].notna()].copy()
df_cve_rich = df[df["related_cves"].notna()].copy()

# Notebook-style displays: preview of the full frame ...
df.head()

# ... number of rows with related CVEs ...
len(df_cve_rich)

# ... and number of rows with CPE matches.
len(df_cpe_rich)

# Count certificates per `type` value for each of the three populations,
# ordered by type so the series line up when concatenated.
categories_cpe = df_cpe_rich["type"].value_counts().sort_index().rename('Type distribution CPE-rich')
categories_cve = df_cve_rich["type"].value_counts().sort_index().rename('Type distribution CVE-rich')
categories_all = df["type"].value_counts().sort_index().rename('Type distribution all')

# One column per population, one row per type.
categories_merged = pd.concat([categories_all, categories_cpe, categories_cve], axis=1)

# Turn counts into fractions: divide every column by its own total so that
# populations of different sizes can be compared on one chart.
column_totals = categories_merged.sum(axis=0)
categories_merged = categories_merged / column_totals

categories_merged.plot.bar(title='Type comparison between CPE-rich, CVE-rich and all certificates');

# Count certificates per `year_from` value for the CPE-rich, CVE-rich and
# complete populations, ordered by year.
years_cpe = df_cpe_rich.year_from.value_counts().sort_index().rename('Year distribution CPE-rich')
years_cve = df_cve_rich.year_from.value_counts().sort_index().rename('Year distribution CVE-rich')
years_all = df.year_from.value_counts().sort_index().rename('Year distribution all certificates')

# One column per population, one row per year.
years_merged = pd.concat([years_all, years_cpe, years_cve], axis=1)
years_merged.index.name = "year_from"

# NOTE(review): hard-coded cutoff; presumably drops years with incomplete data -- confirm.
years_merged = years_merged.loc[years_merged.index < 2022]

# Normalize each column by its own total so the curves show shares, not raw counts.
years_merged = years_merged.div(years_merged.sum(axis=0), axis=1)

# Title typo fixed ("comparision" -> "comparison").
years_merged.plot.line(title='Years comparison between CPE-rich, CVE-rich and all certificates');

# Count certificates per `level` value for the CPE-rich, CVE-rich and
# complete populations, ordered by level.
levels_cpe = df_cpe_rich.level.value_counts().sort_index().rename('Level distribution CPE-rich')
levels_cve = df_cve_rich.level.value_counts().sort_index().rename('Level distribution CVE-rich')
levels_all = df.level.value_counts().sort_index().rename('Level distribution all certificates')

# One column per population, one row per level.
levels_merged = pd.concat([levels_all, levels_cpe, levels_cve], axis=1)

# Normalize each column by its own total so the bars show shares, not raw counts.
levels_merged = levels_merged.div(levels_merged.sum(axis=0), axis=1)

# The title previously read "EAL comparision": the data comes from a FIPS dataset
# and the series above are labelled "Level distribution", so the chart is titled
# by level as well, and the "comparison" typo is fixed.
levels_merged.plot.bar(title='Level comparison between CPE-rich, CVE-rich and all certificates');

# Spearman rank correlation preconfigured for this analysis: NaN observations
# are omitted, and the test is one-sided (alternative="less", i.e. the
# alternative hypothesis is a negative correlation).
spearmanr = functools.partial(
    stats.spearmanr,
    nan_policy="omit",
    alternative="less",
)

# Level vs. number of CVEs.
n_cves_level_corr, n_cves_level_pvalue = spearmanr(
    df_cve_rich["level"], df_cve_rich["n_cves"]
)
print(n_cves_level_corr, n_cves_level_pvalue)

# Level vs. worst CVE score.
worst_cve_level_corr, worst_cve_level_pvalue = spearmanr(
    df_cve_rich["level"], df_cve_rich["worst_cve_score"]
)
print(worst_cve_level_corr, worst_cve_level_pvalue)

# Level vs. average CVE score.
avg_cve_level_corr, avg_cve_level_pvalue = spearmanr(
    df_cve_rich["level"], df_cve_rich["avg_cve_score"]
)
print(avg_cve_level_corr, avg_cve_level_pvalue)

# Draw one relational plot of `level` against each CVE-derived column, showing
# every figure before the next is created. `g` ends up bound to the last grid.
for cve_metric in ("n_cves", "worst_cve_score", "avg_cve_score"):
    g = sns.relplot(data=df_cve_rich, x="level", y=cve_metric)
    plt.show()