Trying Object Detection (YOLOv2) with TensorFlow [Video Analysis Edition]
Last time, we set up YOLOv2 and ran object detection on still images.
This time, we load a video and run object detection on it.
The analyzed video is also saved to a file.
[Results]
First, the results.
(Image: a cropped frame from the detection output)
[Processing]
The original source handled all imaging with PIL.
Here, video reading and writing are done with OpenCV, so each frame flows through
OpenCV -> PIL -> YOLOv2 -> PIL -> OpenCV
before being displayed and written out.
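Since the detector works on PIL images while OpenCV hands back BGR ndarrays, the glue is just a channel reorder plus an ndarray/Image wrap in each direction. A minimal sketch of that round trip (sample.jpg stands in for one decoded video frame):

import cv2
import numpy as np
from PIL import Image

frame = cv2.imread('sample.jpg')               # OpenCV frame: ndarray in BGR order
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)   # reorder channels for PIL
pil_image = Image.fromarray(rgb)               # this is what gets fed to YOLOv2

# ... run detection and draw boxes on pil_image here ...

bgr = cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)  # back to BGR for imshow/VideoWriter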
[test_yolo_video.py]

#! /usr/bin/env python
# python test_yolo_video.py model_data/yolo.h5
"""Run a YOLO_v2 style detection model on a video."""
import argparse
import colorsys
import os
import random

import numpy as np
from keras import backend as K
from keras.models import load_model
from PIL import Image, ImageDraw, ImageFont

from yad2k.models.keras_yolo import yolo_eval, yolo_head
import cv2

input_width, input_height = (416, 416)

parser = argparse.ArgumentParser(
    description='Run a YOLO_v2 style detection model on a video.')
parser.add_argument(
    'model_path',
    help='path to h5 model file containing body '
    'of a YOLO_v2 model')
parser.add_argument(
    '-a',
    '--anchors_path',
    help='path to anchors file, defaults to yolo_anchors.txt',
    default='model_data/yolo_anchors.txt')
parser.add_argument(
    '-c',
    '--classes_path',
    help='path to classes file, defaults to coco_classes.txt',
    default='model_data/coco_classes.txt')
parser.add_argument(
    '-t',
    '--test_path',
    help='path to directory of test images, defaults to images/',
    default='images')
parser.add_argument(
    '-o',
    '--output_path',
    help='path to output test images, defaults to images/out',
    default='images/out')
parser.add_argument(
    '-s',
    '--score_threshold',
    type=float,
    help='threshold for bounding box scores, default .3',
    default=.3)
parser.add_argument(
    '-iou',
    '--iou_threshold',
    type=float,
    help='threshold for non max suppression IOU, default .5',
    default=.5)


def _main(args):
    model_path = os.path.expanduser(args.model_path)
    assert model_path.endswith('.h5'), 'Keras model must be a .h5 file.'
    anchors_path = os.path.expanduser(args.anchors_path)
    classes_path = os.path.expanduser(args.classes_path)
    test_path = os.path.expanduser(args.test_path)
    output_path = os.path.expanduser(args.output_path)

    sess = K.get_session()  # TODO: Remove dependence on Tensorflow session.

    with open(classes_path) as f:
        class_names = f.readlines()
    class_names = [c.strip() for c in class_names]

    with open(anchors_path) as f:
        anchors = f.readline()
        anchors = [float(x) for x in anchors.split(',')]
        anchors = np.array(anchors).reshape(-1, 2)

    yolo_model = load_model(model_path)

    # Verify model, anchors, and classes are compatible
    num_classes = len(class_names)
    num_anchors = len(anchors)
    # TODO: Assumes dim ordering is channel last
    model_output_channels = yolo_model.layers[-1].output_shape[-1]
    assert model_output_channels == num_anchors * (num_classes + 5), \
        'Mismatch between model and given anchor and class sizes. ' \
        'Specify matching anchors and classes with --anchors_path and ' \
        '--classes_path flags.'
    print('{} model, anchors, and classes loaded.'.format(model_path))

    # Check if model is fully convolutional, assuming channel last order.
    model_image_size = yolo_model.layers[0].input_shape[1:3]
    is_fixed_size = model_image_size != (None, None)

    # Generate colors for drawing bounding boxes.
    hsv_tuples = [(x / len(class_names), 1., 1.)
                  for x in range(len(class_names))]
    colors = list(map(lambda x: colorsys.hsv_to_rgb(*x), hsv_tuples))
    colors = list(
        map(lambda x: (int(x[0] * 255), int(x[1] * 255), int(x[2] * 255)),
            colors))
    random.seed(10101)  # Fixed seed for consistent colors across runs.
    random.shuffle(colors)  # Shuffle colors to decorrelate adjacent classes.
    random.seed(None)  # Reset seed to default.

    # Generate output tensor targets for filtered bounding boxes.
    # TODO: Wrap these backend operations with Keras layers.
    yolo_outputs = yolo_head(yolo_model.output, anchors, len(class_names))
    input_image_shape = K.placeholder(shape=(2, ))
    boxes, scores, classes = yolo_eval(
        yolo_outputs,
        input_image_shape,
        score_threshold=args.score_threshold,
        iou_threshold=args.iou_threshold)

    # Video input/output setup
    cap = cv2.VideoCapture("input.mp4")
    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'MJPG')
    # Read one frame up front to get the frame size (this frame is not processed).
    ret, frame = cap.read()
    h, w, ch = frame.shape
    output_video = cv2.VideoWriter('output.avi', fourcc, 30.0, (w, h))

    while cap.isOpened():
        # Grab the next frame; stop when the video runs out.
        ret, frame = cap.read()
        if not ret:
            break

        # Convert BGR (OpenCV) to RGB
        cv_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        # Convert to PIL
        src_image_pil = Image.fromarray(cv_rgb)
        pil_normalize = src_image_pil.convert('RGB')
        image = pil_normalize

        if is_fixed_size:  # TODO: When resizing we can use minibatch input.
            resized_image = image.resize(
                tuple(reversed(model_image_size)), Image.BICUBIC)
            image_data = np.array(resized_image, dtype='float32')
        else:
            # Due to skip connection + max pooling in YOLO_v2, inputs must have
            # width and height as multiples of 32.
            new_image_size = (image.width - (image.width % 32),
                              image.height - (image.height % 32))
            resized_image = image.resize(new_image_size, Image.BICUBIC)
            image_data = np.array(resized_image, dtype='float32')

        image_data /= 255.
        image_data = np.expand_dims(image_data, 0)  # Add batch dimension.

        out_boxes, out_scores, out_classes = sess.run(
            [boxes, scores, classes],
            feed_dict={
                yolo_model.input: image_data,
                input_image_shape: [image.size[1], image.size[0]],
                K.learning_phase(): 0
            })

        font = ImageFont.truetype(
            font='font/FiraMono-Medium.otf',
            size=np.floor(3e-2 * image.size[1] + 0.5).astype('int32'))
        thickness = (image.size[0] + image.size[1]) // 300

        for i, c in reversed(list(enumerate(out_classes))):
            predicted_class = class_names[c]
            box = out_boxes[i]
            score = out_scores[i]

            label = '{} {:.2f}'.format(predicted_class, score)

            draw = ImageDraw.Draw(image)
            label_size = draw.textsize(label, font)

            top, left, bottom, right = box
            top = max(0, np.floor(top + 0.5).astype('int32'))
            left = max(0, np.floor(left + 0.5).astype('int32'))
            bottom = min(image.size[1], np.floor(bottom + 0.5).astype('int32'))
            right = min(image.size[0], np.floor(right + 0.5).astype('int32'))

            if top - label_size[1] >= 0:
                text_origin = np.array([left, top - label_size[1]])
            else:
                text_origin = np.array([left, top + 1])

            # My kingdom for a good redistributable image drawing library.
            for i in range(thickness):
                draw.rectangle(
                    [left + i, top + i, right - i, bottom - i],
                    outline=colors[c])
            draw.rectangle(
                [tuple(text_origin), tuple(text_origin + label_size)],
                fill=colors[c])
            draw.text(text_origin, label, fill=(0, 0, 0), font=font)
            del draw

        # Convert back to OpenCV (BGR), then display and save
        cv_output = np.asarray(image)
        cv_output = cv2.cvtColor(cv_output, cv2.COLOR_RGB2BGR)
        cv2.imshow("view", cv_output)
        output_video.write(cv_output)

        # Quit early if the q key is pressed
        if cv2.waitKey(5) & 0xFF == ord('q'):
            break

    cap.release()
    output_video.release()
    cv2.destroyAllWindows()
    sess.close()


if __name__ == '__main__':
    _main(parser.parse_args())
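One caveat on the script above: the VideoWriter is created with a hardcoded 30.0 fps, so the output plays back at the wrong speed for sources recorded at other rates. If that matters, you could query the rate from the capture instead (a small tweak of my own, not in the original):

fps = cap.get(cv2.CAP_PROP_FPS) or 30.0  # cap.get returns 0.0 when the container has no FPS metadata
output_video = cv2.VideoWriter('output.avi', fourcc, fps, (w, h))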
The path to the input video is hardcoded in the source, so place input.mp4 in the same folder as the script.
(I plan to turn this into a command-line argument later; a sketch of that change follows.)
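If you want to make that change yourself before I do, one possibility is to register the path with the existing parser and hand it to cv2.VideoCapture inside _main() (the --video_path flag is my own placeholder, not part of the original script):

parser.add_argument(
    '-v',
    '--video_path',
    help='path to the input video, defaults to input.mp4',
    default='input.mp4')

# inside _main(), replacing the hardcoded path:
cap = cv2.VideoCapture(os.path.expanduser(args.video_path))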
Run it:

python test_yolo_video.py model_data/yolo.h5

A window named "view" displays each annotated frame as it is processed; press the q key to quit early. The finished video is saved as output.avi.
The end.