Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove watermark mapper #236

Merged
merged 8 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -148,11 +148,14 @@ process:
frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
horizontal_flip: false # flip frame image horizontally (left to right).
vertical_flip: false # flip frame image vertically (top to bottom).
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_remove_watermark_mapper: # Remove the watermarks in videos given regions
roi_strings: ['0,0,0.1,0.1'] # a given list of regions where the watermarks are located. The format of each can be "x1, y1, x2, y2", "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
roi_type: ratio # the roi string type. When the type is 'pixel', (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is 'ratio', the coordinates are normalized by widths and heights.
roi_key: null # the key name of fields in samples to store roi_strings for each sample. It's used to set different rois for different samples.
frame_num: 10 # the number of frames to be extracted uniformly from the video to detect the pixels of watermark.
min_frame_threshold: 7 # a coordinate is considered as the location of a watermark pixel when it is a watermark pixel in no less than min_frame_threshold frames.
detection_method: pixel_value # the method to detect the pixels of watermark. If it is 'pixel_value', we consider the distribution of pixel value in each frame. If it is 'pixel_diversity', we will consider the pixel diversity in different frames.
- video_resize_aspect_ratio_mapper: # resize videos aspect ratios of videos (a fraction of width by height, r=w/h) to a specified range
min_ratio: 9/21 # the minimum aspect ratio to enforce videos with an aspect ratio below `min_ratio` will be resized to match this minimum ratio. The ratio should be provided as a string in the format "9:21" or "9/21".
max_ratio: 21/9 # the maximum aspect ratio to enforce videos with an aspect ratio above `max_ratio` will be resized to match this maximum ratio. The ratio should be provided as a string in the format "21:9" or "21/9".
Expand All @@ -164,13 +167,17 @@ process:
max_height: 1080 # the max vertical resolution (unit p), videos with height more than 'max_height' will be mapped to videos with equal or smaller height
force_original_aspect_ratio: 'increase' # Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio
force_divisible_by: 4 # Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio
- video_ffmpeg_wrapped_mapper: # simple wrapper for FFmpeg video filters
- video_split_by_duration_mapper: # Mapper to split video by duration.
split_duration: 10 # duration of each video split in seconds.
min_last_split_duration: 0.1 # the minimum allowable duration in seconds for the last video split. If the duration of the last split is less than this value, it will be discarded.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_key_frame_mapper: # Mapper to split video by key frame.
keep_original_sample: true # whether to keep the original sample. If it's set to False, there will be only cut sample in the final datasets and the original sample will be removed. It's True in default
- video_split_by_scene_mapper: # split videos into scene clips
detector: 'ContentDetector' # PySceneDetect scene detector. Should be one of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector']
threshold: 27.0 # threshold passed to the detector
min_scene_len: 15 # minimum length of any scene
show_progress: false # whether to show progress from scenedetect
- video_tagging_from_audio_mapper: # Mapper to generate video tags from audio streams extracted from the video.
hf_ast: 'MIT/ast-finetuned-audioset-10-10-0.4593' # Huggingface model name for the audio classification model.
- video_tagging_from_frames_mapper: # Mapper to generate video tags from frames extracted from the video.
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
replace_content_mapper, sentence_split_mapper,
video_captioning_from_audio_mapper,
video_captioning_from_video_mapper, video_ffmpeg_wrapped_mapper,
video_resize_aspect_ratio_mapper,
video_remove_watermark_mapper, video_resize_aspect_ratio_mapper,
video_resize_resolution_mapper, video_split_by_duration_mapper,
video_split_by_key_frame_mapper, video_split_by_scene_mapper,
video_tagging_from_audio_mapper,
Expand Down
229 changes: 229 additions & 0 deletions data_juicer/ops/mapper/video_remove_watermark_mapper.py
BeachWang marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
import os

import av
import numpy as np
from jsonargparse.typing import List, PositiveInt

from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
load_data_with_context, load_video,
parse_string_to_roi,
process_each_frame)

from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS

# Name under which this operator is registered in the OPERATORS and
# LOADED_VIDEOS registries; it is also passed to transfer_filename when the
# output file name of a cleaned video is derived.
OP_NAME = 'video_remove_watermark_mapper'

# OpenCV is imported inside the availability guard.
# NOTE(review): presumably AvailabilityChecking reports a missing
# 'opencv-python' package for this operator and HiddenPrints suppresses
# output printed during the import -- confirm against their definitions.
with AvailabilityChecking(['opencv-python'], OP_NAME), HiddenPrints():
    import cv2 as cv


@OPERATORS.register_module(OP_NAME)
@LOADED_VIDEOS.register_module(OP_NAME)
class VideoRemoveWatermarkMapper(Mapper):
    """
    Remove the watermarks in videos given regions.

    For every video of a sample, a binary watermark mask is estimated inside
    the given regions of interest (ROIs) from a few uniformly extracted
    frames, the mask is dilated, and every frame of the video is inpainted
    on the masked pixels.
    """

    def __init__(self,
                 roi_strings: List[str] = ['0,0,0.1,0.1'],
                 roi_type: str = 'ratio',
                 roi_key: str = None,
                 frame_num: PositiveInt = 10,
                 min_frame_threshold: PositiveInt = 7,
                 detection_method: str = 'pixel_value',
                 threshold: int = None,
                 *args,
                 **kwargs):
        """
        Initialization method.

        :param roi_strings: a given list of regions where the watermarks
            are located. The format of each can be "x1, y1, x2, y2",
            "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]". The default list is
            only read, never mutated.
        :param roi_type: the roi string type. When the type is 'pixel', (x1,
            y1), (x2, y2) are the locations of pixels in the top left corner
            and the bottom right corner respectively. If the roi_type is
            'ratio', the coordinates are normalized by widths and heights.
        :param roi_key: the key name of fields in samples to store
            roi_strings for each sample. It's used to set different rois for
            different samples. If it's None, use rois in parameter
            "roi_strings". It's None by default.
        :param frame_num: the number of frames to be extracted uniformly from
            the video to detect the pixels of watermark.
        :param min_frame_threshold: a coordinate is considered as the
            location of a watermark pixel when it is detected as such in no
            less than min_frame_threshold frames.
        :param detection_method: the method to detect the pixels of
            watermark. If it is 'pixel_value', we consider the distribution
            of pixel value in each frame. If it is 'pixel_diversity', we will
            consider the pixel diversity in different frames. The
            min_frame_threshold is useless and frame_num must be greater
            than 1 in 'pixel_diversity' mode.
        :param threshold: not read anywhere in this class; accepted only to
            keep the constructor signature unchanged.
        :param args: extra args
        :param kwargs: extra args
        :raises ValueError: if roi_type or detection_method is unsupported,
            if frame_num is below 2 in 'pixel_diversity' mode, or if a string
            in roi_strings cannot be parsed (only checked when roi_key is
            None).
        """
        super().__init__(*args, **kwargs)
        # Must be captured before any further local variable is created so
        # that locals() only holds the constructor arguments.
        self._init_parameters = self.remove_extra_parameters(locals())

        if roi_type not in ['ratio', 'pixel']:
            raise ValueError(f'roi_type [{roi_type}]'
                             f' is not supported. '
                             f"Can only be one of ['ratio', 'pixel']. ")

        if detection_method not in ['pixel_value', 'pixel_diversity']:
            raise ValueError(
                f'detection_method [{detection_method}]'
                f' is not supported. '
                f"Can only be one of ['pixel_value', 'pixel_diversity']. ")

        if detection_method == 'pixel_diversity' and frame_num < 2:
            raise ValueError(
                "frame_num must be greater than 1 in 'pixel_diversity' mode.")

        # The global ROIs are only parsed when no per-sample key is given;
        # otherwise they are parsed per sample in _generate_watermark_mask.
        rois = []
        if roi_key is None:
            for roi_string in roi_strings:
                roi = parse_string_to_roi(roi_string, roi_type)
                if roi is None:
                    raise ValueError(
                        'The roi in roi_strings must be four no negative'
                        ' numbers in the format of "x1, y1, x2, y2", '
                        '"(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".')
                rois.append(roi)

        self.roi_type = roi_type
        self.rois = rois
        self.roi_key = roi_key
        self.frame_num = frame_num
        self.min_frame_threshold = min_frame_threshold
        self.detection_method = detection_method

    def _detect_watermark_via_pixel_value(self, frames, rois):
        """
        Detect watermark pixels from the pixel values of each frame.

        Each ROI of each frame is binarized with Otsu's threshold; a pixel is
        kept in the final mask when it is set in at least
        ``self.min_frame_threshold`` frames.

        :param frames: extracted video frames providing ``to_ndarray``.
        :param rois: regions as (x1, y1, x2, y2) in pixel coordinates.
        :return: uint8 mask holding 255 on watermark pixels and 0 elsewhere.
        """
        masks = []
        for frame in frames:
            frame = frame.to_ndarray(format='bgr24')
            mask = np.zeros_like(frame[:, :, 0], dtype=np.uint8)
            for roi in rois:
                x1, y1, x2, y2 = roi[0], roi[1], roi[2], roi[3]
                # dimension of ndarray frame: height x width x channel
                roi_frame = frame[y1:y2, x1:x2]
                gray_frame = cv.cvtColor(roi_frame, cv.COLOR_BGR2GRAY)
                _, binary_frame = cv.threshold(
                    gray_frame, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU)

                # assume the watermark is located in the box, so the pixel in
                # the edge must be 0, if not, reverse binary_frame
                edge_positive_num = ((binary_frame[0] > 0).sum() +
                                     (binary_frame[:, 0] > 0).sum())
                total = binary_frame.shape[0] + binary_frame.shape[1]
                if edge_positive_num * 2 > total:
                    binary_frame = ~binary_frame

                # union with the marks of other (possibly overlapping) rois
                mask[y1:y2, x1:x2] = mask[y1:y2, x1:x2] | binary_frame
            masks.append(mask)

        # Count the votes in int32: a uint8 accumulator would wrap around
        # when a pixel is marked in more than 255 frames.
        final_mask = sum((mask == 255).astype(np.int32) for mask in masks)
        final_mask = np.where(final_mask >= self.min_frame_threshold, 255, 0)
        final_mask = final_mask.astype(np.uint8)
        return final_mask

    def _detect_watermark_via_pixel_diversity(self, frames, rois):
        """
        Detect watermark pixels from the pixel diversity across frames.

        Inside each ROI the per-pixel standard deviation over all frames
        (summed over channels) is rescaled to [0, 255] and binarized with
        Otsu's threshold; the low-diversity pixels are taken as watermark.

        :param frames: extracted video frames providing ``height``,
            ``width`` and ``to_ndarray``; must not be empty.
        :param rois: regions as (x1, y1, x2, y2) in pixel coordinates.
        :return: uint8 mask holding 255 on watermark pixels and 0 elsewhere.
        """
        mask = np.zeros((frames[0].height, frames[0].width), dtype=np.uint8)
        frames = [frame.to_ndarray(format='bgr24') for frame in frames]

        for roi in rois:
            x1, y1, x2, y2 = roi[0], roi[1], roi[2], roi[3]
            roi_frames = np.stack(
                [frame[y1:y2, x1:x2] for frame in frames], axis=0)
            # std over the frame axis, then summed over the channel axis
            pixel_diversity = roi_frames.std(axis=0).sum(-1)
            max_diversity = np.max(pixel_diversity)
            min_diversity = np.min(pixel_diversity)
            if max_diversity > min_diversity:
                scaled_diversity = 255 * (pixel_diversity - min_diversity) / (
                    max_diversity - min_diversity)
            else:
                # constant diversity: avoid dividing by zero
                scaled_diversity = np.zeros_like(pixel_diversity)
            scaled_diversity = scaled_diversity.astype(np.uint8)
            _, binary_frame = cv.threshold(scaled_diversity, 0, 255,
                                           cv.THRESH_BINARY + cv.THRESH_OTSU)
            # the watermark pixels have less diversity
            binary_frame = ~binary_frame
            mask[y1:y2, x1:x2] = mask[y1:y2, x1:x2] | binary_frame

        return mask

    def _generate_watermark_mask(self, video, sample):
        """
        Build the dilated watermark mask of one video.

        :param video: the loaded video to extract frames from.
        :param sample: the sample; ``sample[self.roi_key]`` is read when
            ``self.roi_key`` is set.
        :return: uint8 mask after dilation with a 5x5 kernel.
        """
        frames = extract_video_frames_uniformly(video, self.frame_num)

        if self.roi_key is not None:
            roi_strings = sample[self.roi_key]
            if isinstance(roi_strings, str):
                roi_strings = [roi_strings]
            rois = [
                parse_string_to_roi(roi_string, self.roi_type)
                for roi_string in roi_strings
            ]
            # unparsable per-sample rois are dropped rather than raised
            rois = [roi for roi in rois if roi is not None]
        else:
            rois = self.rois

        if self.roi_type == 'ratio':
            # Scale the rois selected above. Iterating self.rois here would
            # discard the per-sample rois whenever roi_key is set.
            width, height = frames[0].width, frames[0].height
            rois = [(int(roi[0] * width), int(roi[1] * height),
                     int(roi[2] * width), int(roi[3] * height))
                    for roi in rois]

        if self.detection_method == 'pixel_value':
            mask = self._detect_watermark_via_pixel_value(frames, rois)
        else:
            mask = self._detect_watermark_via_pixel_diversity(frames, rois)

        # grow the mask so that the borders of the watermark are covered
        kernel = np.ones((5, 5), np.uint8)
        return cv.dilate(mask, kernel)

    def _clean_watermark(self, frame, watermark_mask):
        """
        Inpaint the masked pixels of a single frame.

        :param frame: the video frame to clean.
        :param watermark_mask: uint8 mask of the pixels to inpaint.
        :return: a new ``av.VideoFrame`` built from the inpainted image.
        """
        np_frame = frame.to_ndarray(format='bgr24')
        new_np_frame = cv.inpaint(np_frame, watermark_mask, 3, cv.INPAINT_NS)
        return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24')

    def process(self, sample, context=False):
        """
        Remove the watermarks of every video in the sample.

        :param sample: the sample to process; its video list is replaced by
            the keys of the cleaned videos.
        :param context: forwarded to ``load_data_with_context``.
        :return: the processed sample.
        """
        # there is no video in this sample
        if self.video_key not in sample or not sample[self.video_key]:
            return sample

        loaded_video_keys = sample[self.video_key]
        sample, videos = load_data_with_context(sample, context,
                                                loaded_video_keys, load_video)

        for index, video_key in enumerate(loaded_video_keys):
            video = videos[video_key]
            cleaned_video_key = transfer_filename(video_key, OP_NAME,
                                                  **self._init_parameters)

            # skip the work only when the cleaned file already exists on disk
            # and is already one of the videos of this sample
            if (not os.path.exists(cleaned_video_key)
                    or cleaned_video_key not in loaded_video_keys):
                watermark_mask = self._generate_watermark_mask(video, sample)

                def process_frame_func(frame):
                    return self._clean_watermark(frame, watermark_mask)

                process_each_frame(video, cleaned_video_key,
                                   process_frame_func)

            loaded_video_keys[index] = cleaned_video_key

        sample[self.video_key] = loaded_video_keys
        return sample
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/video_split_by_duration_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,8 @@ def split_videos_by_duration(self, video_key, container):

def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample \
or sample[self.video_key] is None \
or len(sample[self.video_key]) == 0:
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
return []

# the split results
Expand Down
5 changes: 2 additions & 3 deletions data_juicer/ops/mapper/video_split_by_key_frame_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,8 @@ def get_split_key_frame(self, video_key, container):

def _process_single_sample(self, sample):
# there is no video in this sample
if self.video_key not in sample \
or sample[self.video_key] is None \
or len(sample[self.video_key]) == 0:
if self.video_key not in sample or sample[
self.video_key] is None or len(sample[self.video_key]) == 0:
return []

# the split results
Expand Down
Loading
Loading