segment_video.py
# USAGE
# python segment_video.py --model enet-cityscapes/enet-model.net --classes enet-cityscapes/enet-classes.txt --colors enet-cityscapes/enet-colors.txt --video videos/massachusetts.mp4 --output output/massachusetts_output.avi
# python segment_video.py --model enet-cityscapes/enet-model.net --classes enet-cityscapes/enet-classes.txt --colors enet-cityscapes/enet-colors.txt --video videos/toronto.mp4 --output output/toronto_output.avi
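# this script applies ENet semantic segmentation to every frame of an
# input video, overlays the color-coded class mask on each frame, and
# writes the blended result to an output video file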
# import the necessary packages
import numpy as np
import argparse
import imutils
import time
import cv2
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-m", "--model", required=True,
help="path to deep learning segmentation model")
ap.add_argument("-c", "--classes", required=True,
help="path to .txt file containing class labels")
ap.add_argument("-v", "--video", required=True,
help="path to input video file")
ap.add_argument("-o", "--output", required=True,
help="path to output video file")
ap.add_argument("-s", "--show", type=int, default=1,
help="whether or not to display frame to screen")
ap.add_argument("-l", "--colors", type=str,
help="path to .txt file containing colors for labels")
ap.add_argument("-w", "--width", type=int, default=500,
help="desired width (in pixels) of input image")
args = vars(ap.parse_args())
# load the class label names
CLASSES = open(args["classes"]).read().strip().split("\n")
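# the class labels file is assumed to contain one label per line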
# if a colors file was supplied, load it from disk
if args["colors"]:
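    # each line of the colors file is expected to hold comma-separated
    # integer color values for one class label (e.g. "0,255,0")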
    COLORS = open(args["colors"]).read().strip().split("\n")
    COLORS = [np.array(c.split(",")).astype("int") for c in COLORS]
    COLORS = np.array(COLORS, dtype="uint8")
# otherwise, we need to randomly generate RGB colors for each class
# label
else:
    # initialize a list of colors to represent each class label in
    # the mask (starting with 'black' for the background/unlabeled
    # regions)
    np.random.seed(42)
    COLORS = np.random.randint(0, 255, size=(len(CLASSES) - 1, 3),
        dtype="uint8")
    COLORS = np.vstack([[0, 0, 0], COLORS]).astype("uint8")
# load our serialized model from disk
print("[INFO] loading model...")
net = cv2.dnn.readNet(args["model"])
# initialize the video stream and pointer to output video file
vs = cv2.VideoCapture(args["video"])
writer = None
# try to determine the total number of frames in the video file
try:
    prop = cv2.cv.CV_CAP_PROP_FRAME_COUNT if imutils.is_cv2() \
        else cv2.CAP_PROP_FRAME_COUNT
    total = int(vs.get(prop))
    print("[INFO] {} total frames in video".format(total))
# an error occurred while trying to determine the total
# number of frames in the video file
except:
print("[INFO] could not determine # of frames in video")
total = -1
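    # a total of -1 simply disables the estimated-processing-time
    # printout inside the frame loop below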
# loop over frames from the video file stream
while True:
    # read the next frame from the file
    (grabbed, frame) = vs.read()
    # if the frame was not grabbed, then we have reached the end
    # of the stream
    if not grabbed:
        break
    # construct a blob from the frame and perform a forward pass
    # using the segmentation model
    frame = imutils.resize(frame, width=args["width"])
    blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (1024, 512), 0,
        swapRB=True, crop=False)
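    # the blob scales pixel values to [0, 1], resizes the frame to the
    # 1024x512 resolution the ENet model expects, applies no mean
    # subtraction, and swaps OpenCV's BGR channel ordering to RGB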
    net.setInput(blob)
    start = time.time()
    output = net.forward()
    end = time.time()
    # infer the total number of classes along with the spatial
    # dimensions of the mask image via the shape of the output array
    (numClasses, height, width) = output.shape[1:4]
    # the output volume is num_classes x height x width in size, so we
    # take the argmax to find the class label with the largest
    # probability for each and every (x, y)-coordinate in the image
    classMap = np.argmax(output[0], axis=0)
    # given the class ID map, we can map each of the class IDs to its
    # corresponding color
    mask = COLORS[classMap]
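    # NumPy fancy indexing: indexing the COLORS array with the
    # (height, width) classMap produces a (height, width, 3) color
    # image in a single vectorized step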
    # resize the mask such that its dimensions match the original size
    # of the input frame
    mask = cv2.resize(mask, (frame.shape[1], frame.shape[0]),
        interpolation=cv2.INTER_NEAREST)
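    # nearest-neighbor interpolation keeps class boundaries crisp and
    # avoids blending two class colors into a new, meaningless value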
    # perform a weighted combination of the input frame with the mask
    # to form an output visualization
    output = ((0.3 * frame) + (0.7 * mask)).astype("uint8")
    # check if the video writer is None
    if writer is None:
        # initialize our video writer
        fourcc = cv2.VideoWriter_fourcc(*"MJPG")
        writer = cv2.VideoWriter(args["output"], fourcc, 30,
            (output.shape[1], output.shape[0]), True)
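        # the 30 FPS value is hard-coded; if the source video has a
        # different frame rate, it could instead be read with
        # vs.get(cv2.CAP_PROP_FPS)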
        # some information on processing a single frame
        if total > 0:
            elap = (end - start)
            print("[INFO] single frame took {:.4f} seconds".format(elap))
            print("[INFO] estimated total time: {:.4f}".format(
                elap * total))
    # write the output frame to disk
    writer.write(output)
    # check to see if we should display the output frame to our screen
    if args["show"] > 0:
        cv2.imshow("Frame", output)
        key = cv2.waitKey(1) & 0xFF
        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
# release the file pointers
print("[INFO] cleaning up...")
writer.release()
vs.release()