Draft: Scrape FIPS algorithm data #276 #409

Draft · wants to merge 6 commits into main
69 changes: 60 additions & 9 deletions src/sec_certs/dataset/fips_algorithm.py
@@ -87,27 +87,78 @@ def download_alg_list_htmls(output_dir: Path) -> list[Path]:
 
         return paths
 
+    @staticmethod
+    def download_algs_data(output_dir: Path, alg_links: list[str]) -> list[Path]:
+        urls = [constants.FIPS_CAVP_URL + "/" + i for i in alg_links]
+        paths = [output_dir / f"alg_page{i}.html" for i in range(len(alg_links))]
+        responses = helpers.download_parallel(urls, paths, progress_bar_desc="Downloading FIPS Algorithm data")
+
+        # Retry the failed downloads once before giving up.
+        failed_tuples = [
+            (url, path) for url, path, resp in zip(urls, paths, responses) if resp != constants.RESPONSE_OK
+        ]
+        if failed_tuples:
+            failed_urls, failed_paths = zip(*failed_tuples)
+            responses = helpers.download_parallel(failed_urls, failed_paths)
+            if any(x != constants.RESPONSE_OK for x in responses):
+                raise ValueError("Failed to download the algorithms data, the dataset won't be constructed.")
+
+        return paths
+
     @staticmethod
     def get_number_of_html_pages(html_path: Path) -> int:
         with html_path.open("r") as handle:
             soup = BeautifulSoup(handle, "html5lib")
             return int(soup.select("span[data-total-pages]")[0].attrs["data-total-pages"])
 
+    @staticmethod
+    def parse_alg_data_from_html(html_path: Path) -> tuple[str, str, str, str]:
+        fields = []
+        with html_path.open("r") as handle:
+            soup = BeautifulSoup(handle, "html5lib")
+            # Pull the Description/Version/Type fields from the detail page, if present.
+            for field in ["Description", "Version", "Type"]:
+                div = soup.find("div", text=field)
+                fields.append("" if div is None else div.find_next_sibling("div").get_text())
+            # Capability names appear inside <b> or <s> tags in the capabilities table.
+            capability_trs = soup.find("table").find("tbody").findAll("tr")
+            capabilities = [c.findAll("td")[1].find(["b", "s"]).get_text().strip() for c in capability_trs]
+        return fields[0], fields[1], fields[2], ", ".join(capabilities)
+
     @staticmethod
     def parse_algorithms_from_html(html_path: Path) -> set[FIPSAlgorithm]:
-        df = pd.read_html(html_path)[0]
+        # extract_links="body" (pandas >= 1.5) turns each body cell into a (text, href) tuple.
+        df = pd.read_html(html_path, extract_links="body")[0]
         for col in df.columns:
             if "Order by" in col:
                 df.rename(columns={col: col.split("Order by")[0]}, inplace=True)
-        df["alg_type"] = df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x))
-        df["alg_number"] = df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x))
-        df["alg"] = df.apply(
-            lambda row: FIPSAlgorithm(
-                row["alg_number"], row["alg_type"], row["Vendor"], row["Implementation"], row["Validation Date"]
-            ),
-            axis=1,
+        # Unwrap the (text, href) tuples; df.assign cannot target "Validation Date"
+        # via keyword (the name has a space), so the unwrapped copy lands in "Validation_Date".
+        df = df.assign(
+            alg_type=df["Validation Number"].map(lambda x: re.sub(r"[0-9\s]", "", x[0])),
+            alg_number=df["Validation Number"].map(lambda x: re.sub(r"[^0-9]", "", x[0])),
+            Vendor=df["Vendor"].map(lambda x: x[0]),
+            Implementation=df["Implementation"].map(lambda x: x[0]),
+            Validation_Date=df["Validation Date"].map(lambda x: x[0]),
         )
+        links = [x[1] for x in df["Validation Number"]]
+
+        with TemporaryDirectory() as tmp_dir:
+            alg_pages = FIPSAlgorithmDataset.download_algs_data(Path(tmp_dir), links)
+            parsed_data = [FIPSAlgorithmDataset.parse_alg_data_from_html(page) for page in alg_pages]
+        descriptions, versions, types, capabilities = zip(*parsed_data)
+        df = df.assign(description=descriptions, version=versions, type=types, algorithm_capabilities=capabilities)
+
+        return set(
+            df.apply(
+                lambda row: FIPSAlgorithm(
+                    row["alg_number"],
+                    row["alg_type"],
+                    row["Vendor"],
+                    row["Implementation"],
+                    row["Validation_Date"],  # the unwrapped copy, not the tuple-valued "Validation Date"
+                    row["description"],
+                    row["version"],
+                    row["type"],
+                    row["algorithm_capabilities"],
+                ),
+                axis=1,
+            )
+        )
-        return set(df["alg"])
 
     def to_pandas(self) -> pd.DataFrame:
         return pd.DataFrame([x.pandas_tuple for x in self], columns=FIPSAlgorithm.pandas_columns).set_index("dgst")
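For context (not part of the diff): the rewritten parser leans on pandas' `extract_links="body"` option, available since pandas 1.5, which wraps every table-body cell in a `(text, href)` tuple. That is why the columns above are unwrapped with `x[0]` and the detail-page links harvested with `x[1]`. A minimal sketch of that behaviour, against an illustrative table rather than a real CAVP listing page:

```python
# Sketch of pandas' extract_links="body" behaviour (pandas >= 1.5).
# The table below is illustrative, not a real CAVP listing page.
from io import StringIO

import pandas as pd

html = """
<table>
  <thead><tr><th>Validation Number</th><th>Vendor</th></tr></thead>
  <tbody>
    <tr>
      <td><a href="/details?validation=12345">AES 12345</a></td>
      <td>Acme Corp</td>
    </tr>
  </tbody>
</table>
"""

df = pd.read_html(StringIO(html), extract_links="body")[0]

# Every body cell becomes a (text, href) tuple; cells without a link get (text, None).
assert df["Validation Number"][0] == ("AES 12345", "/details?validation=12345")
assert df["Vendor"][0] == ("Acme Corp", None)

# Mirrors the unwrapping in parse_algorithms_from_html.
numbers = df["Validation Number"].map(lambda x: x[0])
links = [x[1] for x in df["Validation Number"]]
```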
13 changes: 13 additions & 0 deletions src/sec_certs/sample/fips_algorithm.py
@@ -21,13 +21,22 @@ class FIPSAlgorithm(PandasSerializableType, ComplexSerializableType):
     implementation_name: str
     validation_date: date
 
+    description: str
+    version: str
+    type: str
+    algorithm_capabilities: str
+
     pandas_columns: ClassVar[list[str]] = [
         "dgst",
         "alg_number",
         "algorithm_type",
         "vendor",
         "implementation_name",
         "validation_date",
+        "description",
+        "version",
+        "type",
+        "algorithm_capabilities",
     ]
 
     @property
@@ -39,6 +48,10 @@ def pandas_tuple(self) -> tuple:
             self.vendor,
             self.implementation_name,
             self.validation_date,
+            self.description,
+            self.version,
+            self.type,
+            self.algorithm_capabilities,
         )
 
     @property
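A note on the ordering contract (again, not part of the diff): the four new fields are appended in the same order to the dataclass, `pandas_columns`, and `pandas_tuple`, which `to_pandas` relies on when it zips tuples against column names. A hedged construction sketch follows; the positional order mirrors the `FIPSAlgorithm(...)` call in the dataset diff above, the validation number and type are placeholders, and `dgst` is assumed to be derived rather than passed to the constructor:

```python
# Hedged sketch, not part of the PR. Positional order mirrors the
# FIPSAlgorithm(...) call in parse_algorithms_from_html; "2900"/"AES" are
# placeholders, and dgst is assumed to be derived, not passed in.
from sec_certs.sample.fips_algorithm import FIPSAlgorithm

alg = FIPSAlgorithm(
    "2900",                                       # alg_number (placeholder)
    "AES",                                        # algorithm_type (placeholder)
    "Hewlett-Packard Development Company, L.P.",  # vendor
    "HP Secure Encryption Engine v1.0",           # implementation_name
    "7/10/2014",                                  # validation_date
    "HP Secure Encryption is a controller-based data encryption solution...",  # description (new)
    "PM8061",                                     # version (new)
    "HARDWARE",                                   # type (new)
    "HMAC-SHA2-256, Counter DRBG, AES-ECB, AES-XTS, SHA2-256",  # algorithm_capabilities (new)
)

# The serialization contract: one tuple element per column, same order.
assert len(alg.pandas_tuple) == len(FIPSAlgorithm.pandas_columns)
```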
11 changes: 11 additions & 0 deletions tests/fips/test_fips_algorithm_dataset.py
@@ -36,6 +36,17 @@ def alg_dict() -> dict[str, Any]:
         "vendor": "Hewlett-Packard Development Company, L.P.",
         "implementation_name": "HP Secure Encryption Engine v1.0",
         "validation_date": "7/10/2014",
+        "description": "HP Secure Encryption is a controller-based data "
+        "encryption solution for HP ProLiant Gen8 or newer "
+        "servers that protects data at rest on any bulk storage"
+        " attached to the HP Smart Array controller. The "
+        "solution comprises our 12G family of HP Smart Array "
+        "controllers, the HP Physical Security Kit, and the HP "
+        "Secure Encryption licensing.",
+        "version": "PM8061",
+        "type": "HARDWARE",
+        "algorithm_capabilities": "HMAC-SHA2-256, Counter DRBG, "
+        "AES-ECB, AES-XTS, SHA2-256",
     }
 
 
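A sketch of how the extended fixture might be exercised (hypothetical test, not in the diff); it assumes the `from_dict` constructor from the `ComplexSerializableType` interface, which the sample class declares but this diff does not show:

```python
# Hypothetical test, not part of this PR. Assumes ComplexSerializableType
# provides from_dict(), which FIPSAlgorithm inherits; not shown in this diff.
from typing import Any

from sec_certs.sample.fips_algorithm import FIPSAlgorithm


def test_alg_dict_roundtrip(alg_dict: dict[str, Any]) -> None:
    alg = FIPSAlgorithm.from_dict(alg_dict)
    assert alg.version == "PM8061"
    assert alg.type == "HARDWARE"
    assert alg.description.startswith("HP Secure Encryption")
    assert "AES-XTS" in alg.algorithm_capabilities
```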