-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_summarizer.py
86 lines (76 loc) · 3.2 KB
/
data_summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import pandas as pd
import logging
import json
import base64
from bs4 import BeautifulSoup
import argparse
# Init logging configuration
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='/tmp/test.log', mode='a')
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)
logging.info(f'Logger inited.')
# Define the function to extract useful text from har file
def extract_text_from_har(har_path: str) -> str:
"""
Extract useful text from a har file
:param har_path: str, the path of the har file
:return: str, the extracted text
"""
try:
with open(har_path, "r") as f:
har = json.loads(f.read())
for entry in har["log"]["entries"]:
content = entry["response"]["content"]
if content.get("mimeType", "NoMIMEType")[:9] == "text/html" and "text" in content.keys():
html_doc = base64.b64decode(content["text"])
soup = BeautifulSoup(html_doc, "html.parser")
return " ".join([str(s) for s in soup.strippped_strings])
return ""
except Exception as e:
logging.warning(f"Failed to extract text from {har_path}: {e}")
return ""
# Define the function to summarize the data into a csv file
def summarize_data(data_dir: str, output_csv: str) -> None:
"""
Summarize the data in the data directory and save the summary to a csv file
:param data_dir: str, the directory of the data, generally, the directory contains multiple snapshots
:param output_csv: str, the output csv file path
the csv file contains the columns "domain", "image" and "text"
"""
# Get the list of snapshots
# A snapshot is a directory containing a screenshot (.png) and a network track file (.har)
snapshots = os.listdir(data_dir)
snapshots = [snapshot for snapshot in snapshots if os.path.isdir(os.path.join(data_dir, snapshot))]
# Initialize the summary dataframe
summary_df = pd.DataFrame(columns=["domain", "image", "text"])
# Iterate through the snapshots
for snapshot in snapshots:
snapshot_dir = os.path.join(data_dir, snapshot)
screenshot_path = os.path.join(snapshot_dir, "page_screenshot.png")
har_path = os.path.join(snapshot_dir, "test.har")
text = extract_text_from_har(har_path)
summary_df = pd.concat([
summary_df,
pd.DataFrame({
"domain": [snapshot],
"image": [screenshot_path],
"text": [text]
})
])
# Save the summary dataframe to a csv file
summary_df.to_csv(output_csv, index=False)
logging.info(f"Data summarized and saved to {output_csv}.")
if __name__ == "__main__":
# argparse
parser = argparse.ArgumentParser()
# data_dir containing the snapshots
# e.g., data_dir = /tmp/pfs/screenshots/20240101/ngrok/
parser.add_argument("--data_dir", type=str, help="The directory of the data")
parser.add_argument("--output_csv", type=str, help="The output csv file path")
args = parser.parse_args()
# Summarize the data
summarize_data(args.data_dir, args.output_csv)