模拟相机拍照——对文档进行数据增强

一. 背景

假如我们有一个标准文件，我们对其进行文字识别、版面分析或者其他下游任务就比较容易。然而，当图片是手机拍照获取的，图片中往往有阴影、摩尔纹、弯曲。
那么，如何通过标准的文档，获得类似相机拍照的图片呢？
这里介绍的就是文档数据增强，用标准文档模拟相机拍照场景。该方法不仅能用于文档各场景的数据增强，用于OCR检测识别等任务；还能合成各种图片训练对，用于文档去阴影、文档去摩尔纹、文档弯曲矫正等各项任务。

二. 效果实现

首先给大家展示的是一个PDF截图和对应的标注（红色为标注框）
在这里插入图片描述
下面给标准图片分别添加阴影、摩尔纹、弯曲，效果如下：

摩尔纹+弯曲，并且把标注点映射到弯曲图片上，如下图所示：

阴影+弯曲，并且把标注点映射到弯曲图片上，如下图所示：

三. 算法原理与代码实现

原理：利用渲染工具（推荐blender），渲染出各种弯曲、阴影、摩尔纹，然后再pdf图片上进行合成。
最后，一定要代码实现(只给初级版本，完整版本比较复杂)：

import os
import cv2
import json
import random
import numpy as np
from scipy.interpolate import LinearNDInterpolator as linterp
from scipy.interpolate import NearestNDInterpolator as nearest


class LinearNDInterpolatorExt(object):
    def __init__(self, points, values):
        self.funcinterp = linterp(points, values)
        self.funcnearest = nearest(points, values)

    def __call__(self, *args):
        z = self.funcinterp(*args)
        chk = np.isnan(z)
        if chk.any():
            return np.where(chk, self.funcnearest(*args), z)
        else:
            return z


def crop_flow_from_nan(flow):
    mask = ~np.any(np.isnan(flow), -1)
    x, y, w, h = cv2.boundingRect(mask.astype(np.uint8))
    flow = flow[y: y + h, x: x + w]
    mask = mask[y: y + h, x: x + w]
    max_nonzero_ratio = 0.9
    max_crop_size = 20
    mask_h, mask_w = mask.shape[0], mask.shape[1]
    y0 = max_crop_size
    for i in range(0, max_crop_size):
        if np.count_nonzero(mask[i]) / mask_w > max_nonzero_ratio:
            y0 = i
            break

    y1 = mask_h - 1 - max_crop_size
    for i in range(mask_h - 1, y1, -1):
        if np.count_nonzero(mask[i]) / mask_w > max_nonzero_ratio:
            y1 = i
            break

    crop_mask = mask[y0:y1]
    mask_h, mask_w = crop_mask.shape[0], crop_mask.shape[1]
    x0 = max_crop_size
    for i in range(0, x0):
        if np.count_nonzero(mask[:, i]) / mask_h > max_nonzero_ratio:
            x0 = i
            break

    x1 = mask_w - 1 - max_crop_size
    for i in range(mask_w - 1, x1, -1):
        if np.count_nonzero(mask[:, i]) / mask_h > max_nonzero_ratio:
            x1 = i
            break
    flow = flow[y0:y1, x0:x1]
    return flow


def flow_2_points(flow, pts):
    """
    根据flow映射场反向计算点的对应点
    :param flow: 前向、或后向映射场, range (-1,  1)
    :param pts: 目标图、或原图的坐标点, 点经过归一化 range (0, 1),  shape: (n, 2)
    :return: 原图、或目标图的坐标点, 经过归一化 range (0, 1), shape: (n, 2)
    """
    mask = ~np.any(np.isnan(flow), -1)
    flow_masked = flow[mask]
    flow_w, flow_h = flow.shape[1], flow.shape[0]

    flow_xrange = np.arange(flow_w, dtype=np.float32)
    flow_yrange = np.arange(flow_h, dtype=np.float32)
    flow_xgrid, flow_ygrid = np.meshgrid(flow_xrange, flow_yrange)
    flow_xgrid_masked = flow_xgrid[mask]
    flow_ygrid_masked = flow_ygrid[mask]

    src_pts = (pts - 0.5) * 2  # (0-1) to (-1, 1)
    interpX = LinearNDInterpolatorExt(np.reshape(flow_masked, [-1, 2]), flow_xgrid_masked.reshape(-1))
    interpY = LinearNDInterpolatorExt(np.reshape(flow_masked, [-1, 2]), flow_ygrid_masked.reshape(-1))
    fm_x = interpX(src_pts)
    fm_y = interpY(src_pts)
    # fm_x, fm_y range is (0, flow_w-1)  and (0, flow_h-1), need convert to (0-1)
    fm_x = fm_x / (flow_w - 1)
    fm_y = fm_y / (flow_h - 1)
    return np.stack((fm_x, fm_y), axis=-1)


def warp_img(img, flow, points_list):
    h, w, _ = img.shape
    flow = crop_flow_from_nan(flow)
    flow = flow.astype(np.float32)
    flow = cv2.resize(flow, (256, 256))
    points_list_warp = []
    for points in points_list:
        points = points.astype(np.float64)
        points[:, 0] /= w*1.0
        points[:, 1] /= h*1.0
        points_warp = flow_2_points(flow, points)
        points_warp[:, 0] *= w
        points_warp[:, 1] *= h
        points_list_warp.append(points_warp)

    bm_flow = flow / 2 + 0.5
    bm_flow[..., 0] = bm_flow[..., 0] * w
    bm_flow[..., 1] = bm_flow[..., 1] * h
    bm_flow = np.nan_to_num(bm_flow, nan=-1)
    if bm_flow.shape[0] != h or bm_flow.shape[1] != w:
        bm_flow = cv2.resize(bm_flow, (w, h))

    warp_img = cv2.remap(img, bm_flow.astype(np.float32), None, cv2.INTER_LINEAR, borderValue=(255, 255, 255))
    return warp_img, points_list_warp


def json_2_points(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    obj_list = []
    for obj in data[0]['annotations']:
        obj = obj['coordinates']
        cx, cy, w, h = obj['x'], obj['y'], obj['width'], obj['height']
        x1 = cx - 0.5 * w
        x2 = cx + 0.5 * w
        y1 = cy - 0.5 * h
        y2 = cy + 0.5 * h
        points = np.array([[x1,y1], [x2,y1], [x2,y2], [x1,y2]], np.int32)
        obj_list.append(points)
    return obj_list


def add_background(img, img_background):
    height, width, _ = img.shape
    background = cv2.resize(img_background, (width, height))
    img_res = img * 0.5 + background * 0.5
    img_res = np.clip(img_res, 0, 255)
    return img_res


if __name__ == "__main__":
    img = cv2.imread("test.png")
    shadow = cv2.imread("./background/shadow.jpg")
    img = add_background(img, shadow)
    obj_list = json_2_points("test.json")
    flow = np.load("test.npy")
    warp_img, points_list_warp = warp_img(img, flow, obj_list)
    cv2.imwrite("warp_shadow.jpg", warp_img)
    for points in points_list_warp:
        cv2.polylines(warp_img, [points.astype(np.int32)], isClosed=True, color=(0, 0, 255), thickness=1)
    cv2.imwrite("warp_shadow_draw.jpg", warp_img)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149

致谢，在写代码过程中受到了鑫哥的启发，再次表示感谢！
欢迎小伙伴们技术交流~

在这里插入图片描述

相关阅读:
Zabbix (二) --------- Zabbix 部署
 实时数据仓库
 计算机组成原理期末复习第四章-1（唐朔飞）
【软考系统架构设计师】计算机网络④ IPv6
zabbix监控
 前端基础建设与架构05 Vite 实现：从源码分析出发，构建 bundlele 开发工程
 【数据结构与算法分析】0基础带你学数据结构与算法分析08--二叉查找树 (BST)
docker mysql主从搭建
 X86 SMAP(Supervisor Mode Access Prevention)机制引入的一个问题分析
 学习编程的目的
原文地址：https://blog.csdn.net/Guo_Python/article/details/137928295