數據集成平台：datax將hive數據同步到mysql（全部列和指定列）

1.py腳本

傳入參數：

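target_database（-d / --targetdb）：目標 MySQL 數據庫
target_table（-t / --targettbl）：目標 MySQL 表
target_columns（-c / --columns）：要同步的列，多個列用逗號分隔；傳入 all 表示全部列
target_positions（-p / --positions）：hive列的下標（從0開始），多個下標用逗號分隔；指定列時需要傳入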

# coding=utf-8
import json
import getopt
import os
import sys
import MySQLdb

# MySQL connection settings; adjust them to match the actual environment.
mysql_host = "hadoop102"
# Kept as a string: it is concatenated into the JDBC URL and converted with
# int() when the MySQLdb connection is opened.
mysql_port = "3306"
# The user name and password are also written verbatim into the generated
# DataX job file (mysqlwriter "username" / "password").
mysql_user = "root"
mysql_passwd = "xx"

# HDFS NameNode settings; adjust them to match the actual environment.
# Host and port are joined into the hdfsreader "defaultFS" URL.
hdfs_nn_host = "mycluster"
hdfs_nn_port = "8020"


def get_connection():
    """Open and return a new MySQLdb connection using the module-level MySQL settings."""
    connect_args = {
        "host": mysql_host,
        # The port is configured as a string; MySQLdb is given an integer.
        "port": int(mysql_port),
        "user": mysql_user,
        "passwd": mysql_passwd,
    }
    return MySQLdb.connect(**connect_args)


def get_mysql_meta(database, table, columns):
    """Fetch (COLUMN_NAME, DATA_TYPE) rows for a MySQL table from information_schema.

    Args:
        database: Schema name matched against TABLE_SCHEMA.
        table: Table name matched against TABLE_NAME.
        columns: The literal string 'all' to select every column, or a
            comma-separated list of column names.

    Returns:
        A sequence of (column_name, data_type) rows. For 'all' the rows follow
        the table's ORDINAL_POSITION. For an explicit list the rows follow the
        order in which the columns were requested, so that callers can pair
        them positionally with a caller-supplied list (e.g. Hive indexes).
        An explicit list that contains no names yields an empty tuple.

    Raises:
        ValueError: If an explicitly requested column does not exist in the
            table (silently dropping it would shift every later pairing).
    """
    # Values are bound as query parameters instead of being interpolated into
    # the SQL text, so quotes in command-line input cannot alter the statement.
    sql = ("SELECT COLUMN_NAME, DATA_TYPE FROM information_schema.COLUMNS "
           "WHERE TABLE_SCHEMA=%s AND TABLE_NAME=%s")
    params = [database, table]

    requested = None
    if columns != 'all':
        # Explicit column list: strip whitespace and ignore empty entries.
        requested = [col.strip() for col in columns.split(',') if col.strip()]
        if not requested:
            return ()
        # One placeholder per requested column name.
        sql += " AND COLUMN_NAME IN (%s)" % ", ".join(["%s"] * len(requested))
        params.extend(requested)
    sql += " ORDER BY ORDINAL_POSITION"

    connection = get_connection()
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(sql, params)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when the query fails.
        connection.close()

    if requested is None:
        return rows

    # Re-order the rows to the requested column order. Names are compared
    # case-insensitively so the lookup does not depend on the letter case the
    # caller typed.
    rows_by_name = {row[0].lower(): row for row in rows}
    missing = [name for name in requested if name.lower() not in rows_by_name]
    if missing:
        raise ValueError("columns not found in %s.%s: %s"
                         % (database, table, ", ".join(missing)))
    return tuple(rows_by_name[name.lower()] for name in requested)


def get_mysql_columns(database, table, target_columns):
    """Return the names of the selected MySQL columns as a list.

    Args:
        database: MySQL schema name.
        table: MySQL table name.
        target_columns: 'all' or a comma-separated list of column names,
            passed through to get_mysql_meta.

    Returns:
        A list of column-name strings, in the order get_mysql_meta yields them.
        A real list (not a lazy ``map`` object) is returned so the result can
        be serialised with ``json.dump`` on Python 3 as well as Python 2.
    """
    return [row[0] for row in get_mysql_meta(database, table, target_columns)]


def get_hive_columns(database, table, target_columns, target_positions):
    """Build the hdfsreader "column" entries for the selected MySQL columns.

    Args:
        database: MySQL schema name.
        table: MySQL table name.
        target_columns: 'all' or a comma-separated list of column names.
        target_positions: Comma-separated Hive column indexes (0-based), one
            per selected column. Ignored when target_columns is 'all'.

    Returns:
        A list of dicts. For 'all' each dict is {"name": ..., "type": ...};
        otherwise each dict is {"index": ..., "type": ...} where the i-th
        position is paired with the i-th row returned by get_mysql_meta.

    Raises:
        KeyError: If a column has a MySQL data type that is not in the mapping.
        ValueError: If a position is not an integer, or the number of
            positions differs from the number of selected columns.
    """
    def type_mapping(mysql_type):
        # MySQL DATA_TYPE (lower case) -> type name used in the reader config.
        mappings = {
            "bigint": "bigint",
            "int": "bigint",
            "smallint": "bigint",
            "tinyint": "bigint",
            "mediumint": "bigint",
            "decimal": "string",
            "double": "double",
            "float": "float",
            "binary": "string",
            "char": "string",
            "varchar": "string",
            "datetime": "string",
            "time": "string",
            "timestamp": "string",
            "date": "string",
            "text": "string",
            "bit": "string",
        }
        return mappings[mysql_type]

    meta = get_mysql_meta(database, table, target_columns)

    if target_columns == 'all':
        return [{"name": row[0], "type": type_mapping(row[1].lower())} for row in meta]

    positions = [int(pos) for pos in target_positions.split(',')]
    # A count mismatch would pair indexes with the wrong columns (or run past
    # the end of the list), so fail early with an explicit message.
    if len(positions) != len(meta):
        raise ValueError("expected %d positions for the selected columns, got %d"
                         % (len(meta), len(positions)))
    return [{"index": pos, "type": type_mapping(row[1].lower())}
            for pos, row in zip(positions, meta)]


def generate_json(target_database, target_table, target_columns, target_positions):
    """Write a DataX job file that exports Hive (HDFS ORC) data into a MySQL table.

    The job is written to
    /opt/module/datax/job/export/<target_database>/<target_database>.<target_table>.json;
    the directory is created when it does not exist.

    Args:
        target_database: Target MySQL database name.
        target_table: Target MySQL table name.
        target_columns: 'all' or a comma-separated list of MySQL column names.
        target_positions: Comma-separated Hive column indexes (0-based) for the
            selected columns; unused when target_columns is 'all'.
    """
    if target_columns == 'all':
        # NOTE(review): this is the string "[*]", not the list ["*"]; confirm
        # against the hdfsreader documentation that this form is accepted.
        target_columns_hive = "[*]"
    else:
        # Materialise the result so json.dump can serialise it even when the
        # helper returns a lazy iterator.
        target_columns_hive = list(
            get_hive_columns(target_database, target_table, target_columns, target_positions))
    # Show the resolved reader column configuration. It is computed once, so
    # no extra metadata query is issued just for this diagnostic output.
    print(target_columns_hive)

    target_columns_mysql = list(get_mysql_columns(target_database, target_table, target_columns))

    jdbc_url = ("jdbc:mysql://" + mysql_host + ":" + mysql_port + "/" + target_database
                + "?useUnicode=true&characterEncoding=utf-8&useSSL=false")

    job = {
        "job": {
            "setting": {
                "speed": {
                    "channel": 15
                },
                "errorLimit": {
                    "record": 0,
                    "percentage": 0.02
                }
            },
            "content": [{
                "reader": {
                    "name": "hdfsreader",
                    "batchSize": "8192",
                    "batchByteSize": "33554432",
                    "parameter": {
                        # "${exportdir}" is written literally into the job file.
                        "path": "${exportdir}",
                        "defaultFS": "hdfs://" + hdfs_nn_host + ":" + hdfs_nn_port,
                        "column": target_columns_hive,
                        "fileType": "orc",
                        "encoding": "UTF-8",
                        "fieldDelimiter": u"\u0001",
                        "nullFormat": "\\N"
                    }
                },
                "writer": {
                    "name": "mysqlwriter",
                    "batchSize": "8192",
                    "batchByteSize": "33554432",
                    "parameter": {
                        "writeMode": "replace",
                        "username": mysql_user,
                        "password": mysql_passwd,
                        "column": target_columns_mysql,
                        "connection": [
                            {
                                "jdbcUrl": jdbc_url,
                                "table": [target_table]
                            }
                        ]
                    }
                }
            }]
        }
    }

    output_path = "/opt/module/datax/job/export/" + target_database
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    job_file = os.path.join(output_path, ".".join([target_database, target_table, "json"]))
    with open(job_file, "w") as f:
        json.dump(job, f)


def main(args):
    """Parse command-line options and generate the DataX export job file.

    Options:
        -d / --targetdb   target MySQL database (required)
        -t / --targettbl  target MySQL table (required)
        -c / --columns    'all' or a comma-separated column list (required)
        -p / --positions  comma-separated Hive column indexes, 0-based
                          (required unless --columns is 'all')

    Args:
        args: The argument list without the program name, e.g. sys.argv[1:].

    Raises:
        SystemExit: With status 2 when an option is unknown or a required
            option is missing; a usage message is written to stderr first.
    """
    usage = ("usage: -d <database> -t <table> -c <all|col1,col2,...> "
             "[-p <pos1,pos2,...>]")

    def fail(message):
        # Report the problem together with the usage line and stop.
        sys.stderr.write("%s\n%s\n" % (message, usage))
        sys.exit(2)

    target_database = ""
    target_table = ""
    target_columns = ""  # empty string means no column selection was given
    target_positions = ""

    try:
        options, _ = getopt.getopt(
            args, 'p:d:t:c:', ['positions=', 'targetdb=', 'targettbl=', 'columns='])
    except getopt.GetoptError as err:
        fail(str(err))

    for opt_name, opt_value in options:
        if opt_name in ('-d', '--targetdb'):
            target_database = opt_value
        elif opt_name in ('-t', '--targettbl'):
            target_table = opt_value
        elif opt_name in ('-c', '--columns'):
            target_columns = opt_value
        elif opt_name in ('-p', '--positions'):
            target_positions = opt_value

    # Without these values the generated job would be empty or misnamed, so
    # reject the call up front instead of writing a useless file.
    if not (target_database and target_table and target_columns):
        fail("options -d, -t and -c are required")
    if target_columns != 'all' and not target_positions:
        fail("option -p is required when -c is not 'all'")

    print(target_database, target_table, target_columns, target_positions)
    generate_json(target_database, target_table, target_columns, target_positions)


if __name__ == '__main__':
    main(sys.argv[1:])

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162

2.sh腳本

#!/bin/bash
# Generate the DataX export job for every column of db.table.
# Each option must be separated from the previous value by a space; writing
# "db-t" would be read as a database named "db-t" and -c would never be parsed.
python ~/bin/test.py -d db -t table -c all
# To export selected columns only, pass the column list and the matching
# Hive column indexes instead of "all":
# python ~/bin/test.py -d db -t table -c kunnr,name1,sort2,addrnumber,country,state -p 0,1,2,3,4,5


1
2
3
4
5
6

相关阅读:
如何正确选择ARM核心板的存储类型
 networking /etc/network/interfaces 笔记221102
会务转化如何取得“数字化”突破？会务营销数字化功能推荐
 k8s--基础--21--Statefulset
MATLAB_双馈风力发电机-900V直流混合储能并网系统MATLAB仿真
 计算机系统基础期末复习
 三七互娱，oppo，快手25届暑期实习内推
 借鸡下蛋：室内定位之基于众包采集的 WiFi 指纹地图
 【运维】fstab,systemctl与rc.local启动顺序
 第五十八章学习常用技能 - 查看查询缓存
原文地址：https://blog.csdn.net/m0_37759590/article/details/136270779