-
Notifications
You must be signed in to change notification settings - Fork 197
/
Copy pathgeneral-video-refine-example.yaml
65 lines (61 loc) · 9.28 KB
/
general-video-refine-example.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Process config example including:
# - all global arguments
# - all ops and their arguments
# global parameters
project_name: 'all' # project name for distinguish your configs
dataset_path: '/path/to/a/video-text/dataset.jsonl'
# accepted format: 'weight1(optional) dataset1-path weight2(optional) dataset2-path'
export_path: '/path/to/store/refined/dataset.jsonl'
np: 48 # number of subprocess to process your dataset
# Note: currently, we support specify only ONE key for each op, for cases requiring multiple keys, users can specify the op multiple times. We will only use the first key of `text_keys` when you set multiple keys.
open_tracer: true # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
# for multimodal data processing
video_key: 'videos' # key name of field to store the list of sample video paths.
video_special_token: '<__dj__video>' # the special token that represents a video in the text. In default, it's "<__dj__video>". You can specify your own special token according to your input dataset.
eoc_special_token: '<|__dj__eoc|>' # the special token that represents the end of a chunk in the text. In default, it's "<|__dj__eoc|>". You can specify your own special token according to your input dataset.
# process schedule: a list of several process operators with their arguments
# hyperparameters are set according to the 3-sigma stats on MSR-VTT dataset
process:
- language_id_score_filter: # filter text in specific language with language scores larger than a specific max value
lang: en # keep text in what language
min_score: 0.26311219 # the min language scores to filter text
- perplexity_filter: # filter text with perplexity score out of specific range
lang: en # compute perplexity in what language
max_ppl: 7376.81378 # the max perplexity score to filter text
- video_aesthetics_filter: # filter samples according to the aesthetics score of frame images extracted from videos.
hf_scorer_model: shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE # Huggingface model name for the aesthetics predictor
min_score: 0.31767486 # the min aesthetics score of filter range
max_score: 1.0 # the max aesthetics score of filter range
frame_sampling_method: 'uniform' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframe", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "uniform" with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics.
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode to the all frames extracted from videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- video_frames_text_similarity_filter: # keep samples those similarities between sampled video frame images and text within a specific range.
hf_clip: openai/clip-vit-base-patch32 # clip model name on huggingface to compute the similarity between frame image and text. It's kind of language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
min_score: 0.16571071 # the min similarity to keep samples.
max_score: 1.0 # the max similarity to keep samples.
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
reduce_mode: avg # reduce mode when one text corresponds to multiple videos in a chunk, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_motion_score_filter: # Keep samples with video motion scores within a specific range.
min_score: 0.25 # the minimum motion score to keep samples
max_score: 10000.0 # the maximum motion score to keep samples
sampling_fps: 2 # the sampling rate of frames_per_second to compute optical flow
any_or_all: any # keep this sample when any/all videos meet the filter condition
- video_nsfw_filter: # filter samples according to the nsfw scores of videos in them
hf_nsfw_model: Falconsai/nsfw_image_detection # Huggingface model name for nsfw classification
score_threshold: 0.34847191 # the nsfw score threshold for samples, range from 0 to 1
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute nsfw scores of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition
- video_watermark_filter: # filter samples according to the predicted watermark probabilities of videos in them
hf_watermark_model: amrul-hzz/watermark_detector # Huggingface model name for watermark classification
prob_threshold: 0.96510297 # the predicted watermark probability threshold for samples, range from 0 to 1
frame_sampling_method: all_keyframes # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
reduce_mode: avg # reduce mode for multiple sampled video frames to compute final predicted watermark probabilities of videos, must be one of ['avg','max', 'min'].
any_or_all: any # keep this sample when any/all images meet the filter condition