• Design and implementation of the self-recovering cluster startup command oninitdb:


    Design and implementation of the self-recovering cluster startup command oninitdb:

    • Motivation

      • Solve the problem that, after a node of a GBase 8s cluster goes down, bringing the database back up does not automatically restore the cluster.
    • Approach

      • Replace the oninit command with an oninitdb command

    Create the gbase01/02/03/cm1/cm2 containers

    Dockerfile reference

    FROM centos:7.8.2003
    RUN groupadd -g 1000 gbasedbt && \
        useradd -g gbasedbt -u 1000 -d /home/gbase -m -s /bin/bash gbasedbt && \
         echo "gbasedbt:GBase123" | chpasswd
    COPY profile /home/gbase/.bash_profile
    COPY gbasedbtjdbc_3.3.0_2.jar /home/gbase/gbasedbtjdbc_3.3.0_2.jar
    ADD GBase8sV8.8_3.3.0_2CSDK.tar.xz /opt
    EXPOSE 9088 9200 9300
    CMD su - gbasedbt -c "oninit" && /bin/bash
    

    Repackage gbase and generate a new image

    docker run -itd  --name gbase liaosnet/gbase8s /bin/bash
    docker start gbase
    docker exec -it gbase /bin/bash
    
    -rw-r--r--. 1 root     root     2.4M Mar 31  2021 gbasedbtjdbc_3.3.0_2.jar
    -rwxr-xr-x. 1 gbasedbt gbasedbt  871 Jul 27 15:39 g01.init.instance.name.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt 6.5M Aug  6 04:50 oninitdb
    -rwxr-xr-x. 1 gbasedbt gbasedbt 9.7M Aug 12 05:21 info5wangwei
    -rwxr-xr-x. 1 gbasedbt gbasedbt 1.8K Aug 13 00:30 Ontape-s-L0primary.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt 2.1K Aug 13 00:34 Ontape-p0seconda.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt 5.4M Aug 13 01:08 ftpServer
    -rwxr-xr-x. 1 gbasedbt gbasedbt  86K Aug 15 02:05 ftpClient
    -rwxr-xr-x. 1 gbasedbt gbasedbt   32 Aug 17 11:52 start.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt   50 Aug 17 11:53 qu.info5.servername.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt  309 Aug 17 11:53 qu.info5.running.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt  167 Aug 17 11:54 qu.info5.mode.sh
    -rwxr-xr-x. 1 gbasedbt gbasedbt  155 Aug 17 11:54 ftpUseHelp.sh
    drwxr-xr-x. 2 gbasedbt gbasedbt   49 Aug 17 11:55 config
    -rwxrwxrwx. 1 gbasedbt gbasedbt  28M Aug 17 12:07 b0
    
    docker stop gbase
    docker commit -a "gbase.com" -m "my gbase" gbase gbase:v2 
    
    [root@wei3 homegbase]# docker images
    REPOSITORY                   TAG                 IMAGE ID            CREATED             SIZE
    gbase                        v2                  95825e96d1a5        11 seconds ago      1.46 GB
    gbase                        v1                  fa01ddf56c88        3 hours ago         2.05 GB
    docker.io/liaosnet/gbase8s   latest              11a04a231660        2 months ago        1.36 GB
    
    # Export the image on the host where it was built
    docker save gbase:v2 > /opt/gbase_v2.tar
    # Save the tar file to D:\01.GBase\2022GBASE工作记录\04.文档介质\docker
    # MD5 checksums of the image files:
    189d3e3752df3e21c8c4bc1ca1b2bb41  gbase8s_latest.tar
    82ea433442ed56eda2fa05ee8ca3270f  /opt/gbase_v2.tar
    

    docker run

    ( 1. shell scripts   2. .rhosts   3. run the shell scripts   4. INDEX)
    docker run -p 16060:6060 -p 19088:19088 --name gbase001 -itd gbase:v1 su - gbasedbt -c "/home/gbase/oninitdb; /home/gbase/start.sh ; /home/gbase/info5wangwei "
    
    docker run -p 26060:6060 -p 29088:29088 --name gbase002 -itd gbase:v1 su - gbasedbt -c "/home/gbase/oninitdb; /home/gbase/start.sh ; /home/gbase/info5wangwei "
    
    docker run -p 36060:6060 -p 39088:39088 --name gbase003 -itd gbase:v1 su - gbasedbt -c "/home/gbase/oninitdb; /home/gbase/start.sh ; /home/gbase/info5wangwei "
    
    docker run -p 19099:19099 --name cm1 -itd gbase:v1 su - gbasedbt -c "oncmsm -c /opt/gbase/etc/cm1.cfg;/home/gbase/start.sh;/bin/bash"
    
    docker run -p 29099:29099 --name cm2 -itd gbase:v1 su - gbasedbt -c "oncmsm -c /opt/gbase/etc/cm2.cfg;/home/gbase/start.sh;/bin/bash"
    
    
    Notes:
        1. Port .6060 is used for cluster information exchange and for issuing cluster management commands.
        2. Port .9088 is used as the database listener port.
        3. Port .9099 is used by the Connection Manager; applications connect through these two ports and are automatically routed to the primary node of the database cluster (primary-node proxy ports).
        
    ## In addition, start a client container on each host:
    docker run --name client1 -itd liaosnet/gbase8s /bin/bash
    docker run --name client2 -itd liaosnet/gbase8s /bin/bash
    docker exec -it client1 /bin/bash
    docker exec -it client2 /bin/bash
    cat <<! > $GBASEDBTDIR/etc/sqlhosts
    db_group        group   -       -       i=1
    gbase001    onsoctcp  172.20.97.3 19088 g=db_group
    gbase002    onsoctcp  172.20.97.3 29088 g=db_group
    gbase003    onsoctcp  172.20.97.4 39088 g=db_group
    
    cm_update       group   -       -       i=2
    oltp_update1    onsoctcp 172.20.97.3      19099 g=cm_update
    oltp_update2    onsoctcp 172.20.97.4      29099 g=cm_update
    !
    ## Test
    dbaccess testdb@cm_update <<!
    select count(*) from systables;
    !
    

    .bash_profile

    # .bash_profile
    
    # Get the aliases and functions
    if [ -f ~/.bashrc ]; then
        . ~/.bashrc
    fi
    
    # User specific environment and startup programs
    
    PATH=$PATH:$HOME/.local/bin:$HOME/bin
    
    export PATH
    export GBASEDBTDIR=/opt/gbase
    export GBASEDBTSERVER=gbase001
    export ONCONFIG=onconfig.$GBASEDBTSERVER
    export PATH=$GBASEDBTDIR/bin:${PATH}
    export GBASEDBTSQLHOSTS=/opt/gbase/etc/sqlhosts
    export LD_LIBRARY_PATH=$GBASEDBTDIR/lib:$GBASEDBTDIR/lib/cli:$GBASEDBTDIR/lib/esql:$LD_LIBRARY_PATH
    
    export DB_LOCALE=zh_CN.utf8
    export CLIENT_LOCALE=zh_CN.utf8
    export GL_USEGLU=1
    export DBDATE=Y4MD-
    export DBACCESS_SHOW_TIME=1
    export PS1=gbasedbt'[gbase001]$'
    

    sqlhosts

    gbase001 onsoctcp 172.20.97.4 19088
    gbase002 onsoctcp 172.20.97.4 29088
    gbase003 onsoctcp 172.20.97.4 39088
    

    sqlhosts.cm

    cat <<! > $GBASEDBTDIR/etc/sqlhosts.cm
    db_group        group   -       -       i=1
    gbase001    onsoctcp  172.20.97.3 19088 g=db_group
    gbase002    onsoctcp  172.20.97.3 29088 g=db_group
    gbase003    onsoctcp  172.20.97.4 39088 g=db_group
    
    cm_update       group   -       -       i=2
    oltp_update1    onsoctcp 172.20.97.3      19099 g=cm_update
    oltp_update2    onsoctcp 172.20.97.4      29099 g=cm_update
    !
    

    Build the HDR and RSS cluster

    root shell

    Working directory: /root

    Adjust the root environment of the containers: r01.set.os.sh
    #!/bin/bash
    # Shell script for the root user inside the containers
    #docker exec -it gbase001 /bin/bash
    
    chown gbasedbt:gbasedbt /home/gbase/.bash_profile
    
    sed -i '/gbasedbt soft nproc 65535/d' /etc/security/limits.conf
    sed -i '/gbasedbt hard nproc 65535/d' /etc/security/limits.conf
    sed -i '/gbasedbt soft nofile 1048576/d' /etc/security/limits.conf
    sed -i '/gbasedbt hard nofile 1048576/d' /etc/security/limits.conf
    
    sed -i '/root soft nproc 65535/d' /etc/security/limits.conf
    sed -i '/root hard nproc 65535/d' /etc/security/limits.conf
    sed -i '/root soft nofile 1048576/d' /etc/security/limits.conf
    sed -i '/root hard nofile 1048576/d' /etc/security/limits.conf
    
    
    cat >> /etc/security/limits.conf << EOF
    
    gbasedbt soft nproc 65535
    gbasedbt hard nproc 65535
    gbasedbt soft nofile 1048576
    gbasedbt hard nofile 1048576
    root soft nproc 65535
    root hard nproc 65535
    root soft nofile 1048576
    root hard nofile 1048576
    EOF
    
    touch /gbasedbt
    touch /root/gbasedbt
    
    # Usage
    ./r01.set.os.sh
    

    gbasedbt shell

    # Usage
    docker exec -it gbase001 /bin/bash
    docker exec -it gbase002 /bin/bash
    docker exec -it gbase003 /bin/bash
    docker exec -it cm1 /bin/bash
    docker exec -it cm2 /bin/bash
    sh g01.init.instance.name.sh gbase001 172.20.97.3
    sh g01.init.instance.name.sh gbase002 172.20.97.3
    sh g01.init.instance.name.sh gbase003 172.20.97.4
    sh g01.init.instance.name.sh cm1 172.20.97.3
    sh g01.init.instance.name.sh cm2 172.20.97.4
    . .bash_profile 
    
    

    Working directory: /home/gbase

    docker cp gbase.home.docker.shell.tar.gz gbase001:/home/gbase

    Rename the instance: g01.init.instance.name.sh
    #!/bin/bash
    # Shell script for the gbasedbt user inside the container
    #docker exec -it gbase001 /bin/bash
    #su - gbasedbt
    
    INSTANCE=$1
    IPADDR=$2
    
    sed -i "s/^export GBASEDBTSERVER=.*$/export GBASEDBTSERVER=$INSTANCE/" ~/.bash_profile
    sed -i '/export PS1/d' ~/.bash_profile
    echo "export PS1=`whoami`'[$INSTANCE]\$'" >> ~/.bash_profile
    . ~/.bash_profile
    
    cp $GBASEDBTDIR/etc/onconfig.gbase01 $GBASEDBTDIR/etc/$ONCONFIG
    sed -i "s/^DRAUTO.*$/DRAUTO 3/" $GBASEDBTDIR/etc/$ONCONFIG
    sed -i "s/DBSERVERNAME gbase01/DBSERVERNAME $INSTANCE/" $GBASEDBTDIR/etc/$ONCONFIG
    sed -i "s/^LOG_INDEX_BUILDS.*$/LOG_INDEX_BUILDS 1/" $GBASEDBTDIR/etc/$ONCONFIG
    
    cat > $GBASEDBTDIR/etc/sqlhosts << EOF
    gbase001 onsoctcp $IPADDR 19088 
    gbase002 onsoctcp $IPADDR 29088 
    gbase003 onsoctcp $IPADDR 39088 
    EOF
    echo "sed -i 's/^$INSTANCE onsoctcp $IPADDR/$INSTANCE onsoctcp 0.0.0.0/' $GBASEDBTDIR/etc/sqlhosts" |sh
    
    
    
    Level-0 backup on the primary node: g02.ontape.s.L0.sh
    #!/bin/bash
    # Shell script for the gbasedbt user inside the container
    #docker exec -it gbase001 /bin/bash
    #su - gbasedbt
    
    rm -f 0b
    touch 0b
    chmod 777 0b
    ontape -s -L 0 -t 0b
    
    
    Distribute the 0b backup from the host to the two secondary nodes: s01.docker.cp.sh

    Working directory: /root on the host

    docker cp gbase001:/home/gbase/0b /tmp/
    docker cp /tmp/0b gbase002:/home/gbase/
    docker cp /tmp/0b gbase003:/home/gbase/
    
    
    Physical restore on the secondary nodes with ontape: g02.ontape.p.sh
    chown gbasedbt. /home/gbase/0b
    su - gbasedbt
    ontape -p -t 0b
    #y n n
    
    
    Set up the 3-node RSS cluster
    # On every node
    cat > ~/.rhosts << EOF
    +
    EOF
    
    # gbase001 (primary)
    oninit -v
    onmode -d primary gbase002
    onmode -d primary gbase003
    # gbase002 (RSS)
    oninit -PHY
    onmode -d secondary gbase001
    onmode -d RSS gbase001
    # gbase003 (RSS)
    oninit -PHY
    onmode -d secondary gbase001
    onmode -d RSS gbase001
    
    
    # gbase001 (primary)
    onmode -d add RSS gbase002
    onmode -d add RSS gbase003
    # gbase002 (RSS)
    onmode -d RSS gbase001
    # gbase003 (RSS)
    onmode -d RSS gbase001
    
    
    # Check the results
    onstat -g cluster
    onstat -g dri
    onstat -g rss
    
    
    

    Build the CM cluster

    1. The CM is built so that the cluster fails over automatically.
    2. The plugin startup program oninitdb is built so that a node, when brought up again after a crash, automatically restores the cluster.
    3. Applications only need to connect to cm_update; whether cm1/cm2 fails or gbase01/gbase02/gbase03 fails, the application keeps running normally.

    CM1 and CM2 environment configuration

    # These two containers have already been created
    docker exec -it cm1 /bin/bash
    docker exec -it cm2 /bin/bash
    
    # Adjust the .profile environment variables inside these two containers
    # On cm1:
    export GBASEDBTSERVER=gbase001
    export GBASEDBTSQLHOSTS=/opt/gbase/etc/sqlhosts.cm1
    export PS1=`whoami`'[cm1]\$'
    # On cm2:
    export GBASEDBTSERVER=gbase001
    export GBASEDBTSQLHOSTS=/opt/gbase/etc/sqlhosts.cm2
    export PS1=`whoami`'[cm2]\$'
    
    
    
    CM1 configuration file: sqlhosts.cm1
    cat <<! > $GBASEDBTDIR/etc/sqlhosts.cm1
    db_group        group   -       -       i=1
    gbase001    onsoctcp  172.20.97.3 19088 g=db_group
    gbase002    onsoctcp  172.20.97.3 29088 g=db_group
    gbase003    onsoctcp  172.20.97.4 39088 g=db_group
    
    cm_update       group   -       -       i=2
    oltp_update1    onsoctcp 0.0.0.0      19099 g=cm_update
    oltp_update2    onsoctcp 172.20.97.4      29099 g=cm_update
    !
    
    
    CM1 configuration file: cm1.cfg
    cat <<! >$GBASEDBTDIR/etc/cm1.cfg
    NAME            cm1
    LOGFILE         ${GBASEDBTDIR}/cm1.log
    CM_TIMEOUT 30
    SECONDARY_EVENT_TIMEOUT 30
    SQLHOSTS LOCAL
    LOCAL_IP 172.20.97.3
    EVENT_TIMEOUT 30
    LOG 1
    DEBUG 0
    CLUSTER CLUSTER1
    {
    GBASEDBTSERVER       db_group
    SLA oltp_update1 DBSERVERS=PRI
    FOC ORDER=ENABLED TIMEOUT=1 RETRY=2 PRIORITY=98
    }
    !
    
    
    Start CM1
    oncmsm -c $GBASEDBTDIR/etc/cm1.cfg
    
    
    CM2 configuration file: sqlhosts.cm2
    cat <<! > $GBASEDBTDIR/etc/sqlhosts.cm2
    db_group        group   -       -       i=1
    gbase001    onsoctcp  172.20.97.3 19088 g=db_group
    gbase002    onsoctcp  172.20.97.3 29088 g=db_group
    gbase003    onsoctcp  172.20.97.4 39088 g=db_group
    
    cm_update       group   -       -       i=2
    oltp_update1    onsoctcp 172.20.97.3      19099 g=cm_update
    oltp_update2    onsoctcp 0.0.0.0      29099 g=cm_update
    !
    
    
    CM2 configuration file: cm2.cfg
    cat <<! >$GBASEDBTDIR/etc/cm2.cfg
    NAME            cm2
    LOGFILE         ${GBASEDBTDIR}/cm2.log
    CM_TIMEOUT 30
    SECONDARY_EVENT_TIMEOUT 30
    SQLHOSTS LOCAL
    LOCAL_IP 172.20.97.4
    EVENT_TIMEOUT 30
    LOG 1
    DEBUG 0
    CLUSTER CLUSTER2
    {
    GBASEDBTSERVER       db_group
    SLA oltp_update2 DBSERVERS=PRI
    FOC ORDER=ENABLED TIMEOUT=1 RETRY=2 PRIORITY=99
    }
    !
    
    
    Start CM2
    oncmsm -c $GBASEDBTDIR/etc/cm2.cfg
    
    

    Result after the configuration is complete

    gbasedbt[gbase01]$onstat -g cmsm
    On-Line (Prim) -- Up 02:22:30 -- 833360 Kbytes
    Unified Connection Manager: cm1                      Hostname: gateway
    
    CLUSTER         CLUSTER1        LOCAL
            GBasedbt Servers: db_group
            SLA                    Connections   Service/Protocol   Rule
            oltp_update1                     1      9099/onsoctcp   DBSERVERS=PRI
    
            Failover Arbitrator: Active Arbitrator, Primary is up
            ORDER=SDS,HDR,RSS PRIORITY=98 TIMEOUT=1
    
    Unified Connection Manager: cm2                      Hostname: gateway
    
    CLUSTER         CLUSTER2        LOCAL
            GBasedbt Servers: db_group
            SLA                    Connections   Service/Protocol   Rule
            oltp_update2                     0      9099/onsoctcp   DBSERVERS=PRI
    
            Failover Arbitrator: Failover is enabled
            ORDER=SDS,HDR,RSS PRIORITY=99 TIMEOUT=1
    
    
    gbasedbt[gbase01]$
    
    

    Test result after shutting down cm1

    gbasedbt[cm1]$oncmsm -k cm1
    Shut down Connection Manager cm1
    gbasedbt[cm1]$
    
    
    gbasedbt[gbase01]$onstat -g cmsm
    On-Line (Prim) -- Up 02:27:14 -- 833360 Kbytes
    Unified Connection Manager: cm2                      Hostname: gateway
    
    CLUSTER         CLUSTER2        LOCAL
            GBasedbt Servers: db_group
            SLA                    Connections   Service/Protocol   Rule
            oltp_update2                     1      9099/onsoctcp   DBSERVERS=PRI
    
            Failover Arbitrator: Active Arbitrator, Primary is up
            ORDER=SDS,HDR,RSS PRIORITY=99 TIMEOUT=1
    
    
    gbasedbt[gbase01]$
    
    

    Other notes

    # Error encountered
    gbasedbt[gbase01]$onstat -
    -bash: fork: retry: No child processes
    -bash: fork: retry: No child processes
    
    # Fix: add the settings below inside the Docker container (the host needs them as well).
    [root@localhost ~]#vi /etc/security/limits.conf
    gbasedbt	soft	nproc	65535
    gbasedbt	hard	nproc	65535
    gbasedbt	soft	nofile	1048576
    gbasedbt	hard	nofile	1048576
    root	soft	nproc	65535
    root	hard	nproc	65535
    root	soft	nofile	1048576
    root	hard	nofile	1048576
    
    
    # Errors in the RSS node's log:
    05:52:59  listener-thread: err = -956: oserr = 0: errstr = gbasedbt@gateway[7178077f54c9]: Client host or user gbasedbt@gateway[7178077f54c9] is not trusted by the server.
    05:53:07  listener-thread: err = -956: oserr = 0: errstr = gbasedbt@gateway[ac5159d24585]: Client host or user gbasedbt@gateway[ac5159d24585] is not trusted by the server.
    05:53:12  listener-thread: err = -956: oserr = 0: errstr = gbasedbt@gateway[ac5159d24585]: Client host or user gbasedbt@gateway[ac5159d24585] is not trusted by the server.
    # At the same time, the CM log reports:
    05:52:26 The server type of cluster CLUSTER1 server gbase03 is RSS.
    05:52:39 The server type of cluster CLUSTER1 server gbase03 is RSS.
    05:52:59 The server type of cluster CLUSTER1 server gbase03 is RSS.
    ... both CMs report this
    05:51:03 The server type of cluster CLUSTER2 server gbase03 is RSS.
    05:51:09 The server type of cluster CLUSTER2 server gbase03 is RSS.
    05:51:22 The server type of cluster CLUSTER2 server gbase03 is RSS.
    
    # Fix: add a .rhosts file; afterwards the CM connects:
    05:56:39 Connection Manager successfully connected to gbase03
    
    

    Writing the plugin startup program: oninitdb

    Now repeat the earlier test: stop the gbase001 container and start it again. The container starts, but the database cannot be brought up with oninit, because the cluster already has a new primary node. All that is needed is an oninitdb plugin startup script to use in place of oninit.

    Prerequisites for the plugin startup program

    How should commands and state be shared between containers? Across different hosts, docker cp paths get too long, and ssh would require an sshd service inside every container; neither is a good fit. So use a REST API instead.

    Program and API design

    Each node exposes its own state (info5)

    servername:
    mode:
    running:
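
    A minimal client-side sketch of how a peer (for example, the oninitdb program shown later) might query this endpoint and decode the three fields. The field names and the 172.20.97.4:16060 address come from the examples in this article; the 3-second timeout is an assumption (the original code uses plain http.Get without a timeout).

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"net/http"
    	"time"
    )

    // NodeInfo mirrors the three fields exposed by the info5 /info endpoint.
    type NodeInfo struct {
    	Servername string `json:"servername"`
    	Mode       string `json:"mode"`
    	Running    string `json:"running"`
    }

    func main() {
    	// Short timeout so a crashed container does not block the caller (assumed value).
    	client := &http.Client{Timeout: 3 * time.Second}

    	// Address of one node's info5 service (host port mapping used in this article).
    	resp, err := client.Get("http://172.20.97.4:16060/info")
    	if err != nil {
    		fmt.Println("node unreachable:", err)
    		return
    	}
    	defer resp.Body.Close()

    	var info NodeInfo
    	if err := json.NewDecoder(resp.Body).Decode(&info); err != nil {
    		fmt.Println("decode failed:", err)
    		return
    	}
    	fmt.Printf("servername=%s mode=%s running=%s\n", info.Servername, info.Mode, info.Running)
    }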
    
    

    Passing commands between nodes through the API (info5)

    /primary
    /secondary
    /addrss
    /rss
    /phy
    /killy
    
    

    The oninitdb startup plugin (oninitdb)

    Based on the information exposed above, decide how this node should start and rejoin the cluster, then call the API to carry it out.

    Query the information exposed by each node, one by one
    Learn that gbase001 is down
    Learn that gbase002 is the primary
    Learn that gbase003 is an RSS node
    Start gbase001 in PHY mode
    Tell gbase002 to add an RSS node
    Tell gbase001 to take the RSS role
    Wait for the cluster recovery to complete (see the sketch below)
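
    The last step, waiting for the cluster recovery to complete, is not implemented in the oninitdb code shown later (it only re-reads the cluster state once). A minimal polling sketch, reusing the qu.info5.mode.sh helper from this article; the 10-minute deadline and 10-second interval are assumed values.

    package main

    import (
    	"bytes"
    	"fmt"
    	"os/exec"
    	"strings"
    	"time"
    )

    // localMode runs the same helper script the info5 service uses and returns
    // its output, e.g. "Primary", "RSS" or "unknown".
    func localMode() string {
    	cmd := exec.Command("sh", "qu.info5.mode.sh")
    	var out bytes.Buffer
    	cmd.Stdout = &out
    	if err := cmd.Run(); err != nil {
    		return "unknown"
    	}
    	return strings.TrimSpace(out.String())
    }

    func main() {
    	deadline := time.Now().Add(10 * time.Minute) // assumed upper bound
    	for time.Now().Before(deadline) {
    		mode := localMode()
    		fmt.Println("current local mode:", mode)
    		if mode == "RSS" {
    			fmt.Println("cluster recovery complete: this node is running as RSS")
    			return
    		}
    		time.Sleep(10 * time.Second) // assumed polling interval
    	}
    	fmt.Println("timed out waiting for this node to become RSS")
    }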
    
    

    Program and API implementation

    info5 web API service (set this info5 service to start together with the Docker container)

    package main
    
    import (
    	"bytes"
    	"fmt"
    	"os/exec"
    
    	"github.com/gin-gonic/gin"
    )
    
    type Info struct {
    	servername string
    	mode       string
    	running    string
    }
    
    var info = &Info{
    	servername: "",
    	mode:       "",
    	running:    "",
    }
    
    func main() {
    
    	r := gin.Default()
    	r.GET("/info", func(c *gin.Context) {
    		info.servername = Shell("qu.info5.servername.sh")
    		info.mode = Shell("qu.info5.mode.sh")
    		info.running = Shell("qu.info5.running.sh")
    		c.JSON(200, gin.H{
    			"servername": info.servername,
    			"mode":       info.mode,
    			"running":    info.running,
    		})
    	})
    
    	r.GET("/addrss", func(ctx *gin.Context) {
    		my := ctx.Query("my")
    		if my == "" {
    			ctx.String(200, "my is null")
    			return
    		}
    		cmd := exec.Command("onmode", "-d", "add", "RSS", my)
    		var stdout, stderr bytes.Buffer
    		cmd.Stdout = &stdout // standard output
    		cmd.Stderr = &stderr // standard error
    		err := cmd.Run()
    		outStr, errStr := stdout.String(), stderr.String()
    		fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    		if err != nil {
    			fmt.Printf("cmd.Run() failed with %s\n", err)
    			ctx.String(200, "run addrss err")
    		} else {
    			ctx.String(200, "run addrss ok")
    		}
    	})
    
    	r.Run(":6060")
    
    }
    
    func Shell(shellfile string) string {
    	cmd := exec.Command("sh", shellfile)
    	var stdout, stderr bytes.Buffer
    	cmd.Stdout = &stdout // standard output
    	cmd.Stderr = &stderr // standard error
    	err := cmd.Run()
    	outStr, errStr := stdout.String(), stderr.String()
    	fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    	if err != nil {
    		fmt.Printf("cmd.Run() failed with %s\n", err)
    	}
    	return outStr
    }
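
    The service above implements /info and /addrss, but the endpoint list in the design section also mentions /primary, /secondary, /rss, /phy and /killy. A possible way to wire up some of the remaining routes in the same style, shown here as a self-contained sketch; in practice these handlers would be registered on the existing info5 router rather than run as a separate program, and the command behind each route is an assumption based on the cluster commands used elsewhere in this article.

    package main

    import (
    	"bytes"
    	"fmt"
    	"os/exec"

    	"github.com/gin-gonic/gin"
    )

    // runCmd executes a command and returns its output together with any error text.
    func runCmd(name string, args ...string) string {
    	cmd := exec.Command(name, args...)
    	var out, errBuf bytes.Buffer
    	cmd.Stdout = &out
    	cmd.Stderr = &errBuf
    	if err := cmd.Run(); err != nil {
    		return fmt.Sprintf("error: %v\n%s%s", err, out.String(), errBuf.String())
    	}
    	return out.String() + errBuf.String()
    }

    func main() {
    	r := gin.Default()

    	// /phy: start the local instance in physical-recovery mode.
    	r.GET("/phy", func(c *gin.Context) {
    		c.String(200, "%s", runCmd("oninit", "-PHY"))
    	})

    	// /rss?primary=<name>: make the local instance an RSS secondary of <name>.
    	r.GET("/rss", func(c *gin.Context) {
    		primary := c.Query("primary")
    		if primary == "" {
    			c.String(200, "primary is null")
    			return
    		}
    		c.String(200, "%s", runCmd("onmode", "-d", "RSS", primary))
    	})

    	// /killy: shut the local instance down immediately.
    	r.GET("/killy", func(c *gin.Context) {
    		c.String(200, "%s", runCmd("onmode", "-ky"))
    	})

    	r.Run(":6060") // same port the info5 service listens on
    }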
    
    
    

    The oninitdb command (place this oninitdb binary in the /opt/gbase/bin/ directory inside the Docker container)

    package main
    
    import (
    	"bytes"
    	"encoding/json"
    	"fmt"
    	"io/ioutil"
    	"net/http"
    	"os/exec"
    )
    
    type Info struct {
    	Servername string `json:"servername"` // struct tags must use the lowercase "json" key
    	Mode       string `json:"mode"`
    	Running    string `json:"running"`
    }
    
    var infoList [3]Info
    
    type web struct {
    	servername string
    	ip         string
    	port       string
    }
    
    var webList [3]web
    
    func init() {
    	webList[0].servername = "gbase001"
    	webList[0].ip = "172.20.97.4"
    	webList[0].port = "16060"
    
    	webList[1].servername = "gbase002"
    	webList[1].ip = "172.20.97.4"
    	webList[1].port = "26060"
    
    	webList[2].servername = "gbase003"
    	webList[2].ip = "172.20.97.4"
    	webList[2].port = "36060"
    }
    
    func main() {
    
    	//Who am I
    	fmt.Println("I am:")
    	my := Shell("qu.info5.servername.sh")
    	myrunning := ""
    	zhu := "null"
    	zhuip := ""
    	zhuport := ""
    
    	//Read the cluster state
    	info()
    
    	//If this node is not actually down, do nothing
    	for _, v := range infoList {
    		if v.Servername == my {
    			myrunning = v.Running
    			break
    		}
    	}
    	if myrunning == "OK" {
    		fmt.Println("This node is running fine; no action needed")
    		return
    	}
    
    	//Who is the primary
    	for _, v := range infoList {
    		if v.Mode == "Primary" && v.Running == "OK" {
    			zhu = v.Servername
    			fmt.Println("The primary is:", zhu)
    			break
    		}
    	}
    	for _, v := range webList {
    		if v.servername == zhu {
    			zhuip = v.ip
    			zhuport = v.port
    			fmt.Println("The primary's web IP is:", zhuip)
    			fmt.Println("The primary's web port is:", zhuport)
    			break
    		}
    	}
    
    	//If there is still no primary, do nothing.
    	//The whole point of a multi-node cluster is that a primary node always survives and keeps serving clients.
    	if zhu == "null" {
    		fmt.Println("No healthy primary node in the cluster")
    		return
    	}
    
    	//Clean up any leftovers before starting
    	fmt.Println("Cleaning up leftovers")
    	cmd := exec.Command("onclean", "-ky")
    	var stdout, stderr bytes.Buffer
    	cmd.Stdout = &stdout // standard output
    	cmd.Stderr = &stderr // standard error
    	err := cmd.Run()
    	outStr, errStr := stdout.String(), stderr.String()
    	fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    	if err != nil {
    		fmt.Printf("cmd.Run() failed with %s\n", err)
    	}
    
    	//Start the instance in physical-recovery (PHY) mode
    	cmd = exec.Command("oninit", "-PHY")
    	// var stdout, stderr bytes.Buffer
    	cmd.Stdout = &stdout // standard output
    	cmd.Stderr = &stderr // standard error
    	err = cmd.Run()
    	// outStr, errStr = stdout.String(), stderr.String()
    	// fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    	fmt.Println("Physical startup")
    	if err != nil {
    		fmt.Printf("cmd.Run() failed with %s\n", err)
    	}
    
    	//Ask the primary to add this node as an RSS secondary
    	fmt.Println("Remote add RSS on the primary")
    	u := "http://" + zhuip + ":" + zhuport + "/addrss?my=" + my
    	fmt.Println(u)
    	response, err := http.Get(u)
    	if err != nil {
    		fmt.Println("http.Get /addrss failed")
    		return // without a response there is nothing left to read
    	}
    	buf, _ := ioutil.ReadAll(response.Body)
    	s := string(buf)
    	if s == "run addrss ok" {
    		fmt.Println("run addrss ok")
    	} else {
    		fmt.Println("run addrss err")
    	}
    
    	//Join the cluster as an RSS secondary of the primary
    	cmd = exec.Command("onmode", "-d", "RSS", zhu)
    	// var stdout, stderr bytes.Buffer
    	cmd.Stdout = &stdout // standard output
    	cmd.Stderr = &stderr // standard error
    	err = cmd.Run()
    	// outStr, errStr = stdout.String(), stderr.String()
    	// fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    	fmt.Println("Joining the cluster")
    	if err != nil {
    		fmt.Printf("cmd.Run() failed with %s\n", err)
    	}
    
    	//Check the cluster state again
    	info()
    }
    
    func info() {
    	//Query each db node's info
    	fmt.Println("Checking the cluster")
    	for i, n := range webList {
    		response, err := http.Get("http://" + n.ip + ":" + n.port + "/info")
    		if err != nil {
    			fmt.Println("The info API is not responding; the Docker container may be down.")
    			continue
    		}
    		buf, _ := ioutil.ReadAll(response.Body)
    		json.Unmarshal(buf, &infoList[i])
    		fmt.Println(infoList[i])
    	}
    }
    
    func Shell(shellfile string) string {
    	cmd := exec.Command("sh", shellfile)
    	var stdout, stderr bytes.Buffer
    	cmd.Stdout = &stdout // standard output
    	cmd.Stderr = &stderr // standard error
    	err := cmd.Run()
    	outStr, errStr := stdout.String(), stderr.String()
    	fmt.Printf("out:\n%s\nerr:\n%s\n", outStr, errStr)
    	if err != nil {
    		fmt.Printf("cmd.Run() failed with %s\n", err)
    	}
    	return outStr
    }
    
    
    

    Scripts called by the API

    Get servername and the other two fields (combined): cat > qu.info5.sh
    echo $GBASEDBTSERVER | awk '{print "info5 servername "$1}'
    
    n=`onstat -g rss |wc -l ` 
    if [ $n -gt 2  ]; then
    	onstat -g rss |grep "Local server type" | awk '{print "info5 mode "$4}'
    else
    	echo "info5 mode unknown"
    fi
    
    export INFORMIXCONTIME=3
    export INFORMIXCONRETRY=0
    export TMPRINTOFFSETS=3
    export GBASEDBTCONTIME=3
    export GBASEDBTCONRETRY=0
    dbaccess sysmaster <<! >/dev/null 2>&1
    	select first 1 1 from systables;
    !
    i=$?
    if [ $i -eq 0 ]; then
    	echo "info5 running OK"
    else
    	echo "info5 running ERR"
    fi
    
    
    Get servername (1/3): cat > qu.info5.servername.sh
    echo $GBASEDBTSERVER | awk '{printf("%s",$1)}'
    
    
    Get mode (2/3): cat > qu.info5.mode.sh
    n=`onstat -g rss |wc -l ` 
    if [ $n -gt 2  ]; then
    	onstat -g rss |grep "Local server type" |  awk '{printf("%s",$4)}'
    else
    	awk 'BEGIN {printf("%s","unknown")}'
    fi
    
    
    Get running status (3/3): cat > qu.info5.running.sh
    export INFORMIXCONTIME=3
    export INFORMIXCONRETRY=0
    export TMPRINTOFFSETS=3
    export GBASEDBTCONTIME=3
    export GBASEDBTCONRETRY=0
    dbaccess sysmaster <<! >/dev/null 2>&1
    	select first 1 1 from systables;
    !
    i=$?
    if [ $i -eq 0 ]; then
    	awk 'BEGIN {printf("%s","OK")}'
    else
    	awk 'BEGIN {printf("%s","ERR")}'
    fi
    
    

    Conclusion

    • Have the info5 process start together with the Docker container
    • Replace the oninit command with the oninitdb command
    • This adds the following capabilities to the GBase 8s cluster: 1. After any node goes down, the cluster still has a primary node serving clients; 2. When the downed node is brought back up, it automatically rejoins the cluster and the cluster recovers on its own.
    • Open issues: 1. This write-up covers a 3-node cluster and some names are hardcoded. 2. The case where the database is damaged badly enough to require a fresh backup and restore is not handled. 3. Further updates will follow if needed.

    Upgrading the program again

    Config file added

    [
        {
            "servername": "gbase001",
            "ip": "172.20.97.4",
            "port": "16060"
        },
        {
            "servername": "gbase002",
            "ip": "172.20.97.4",
            "port": "26060"
        },
        {
            "servername": "gbase003",
            "ip": "172.20.97.4",
            "port": "36060"
        }
    ]
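
    No loader for this config file is shown. A minimal sketch of how the hardcoded webList in oninitdb could be replaced by reading this JSON; the field names follow the config above, while the file path and the function name are assumptions.

    package main

    import (
    	"encoding/json"
    	"fmt"
    	"io/ioutil"
    )

    // Node mirrors one entry of the JSON config shown above.
    type Node struct {
    	Servername string `json:"servername"`
    	IP         string `json:"ip"`
    	Port       string `json:"port"`
    }

    // loadNodes reads the cluster node list from a JSON file so that nothing
    // about the cluster layout has to be hardcoded in oninitdb.
    func loadNodes(path string) ([]Node, error) {
    	data, err := ioutil.ReadFile(path)
    	if err != nil {
    		return nil, err
    	}
    	var nodes []Node
    	if err := json.Unmarshal(data, &nodes); err != nil {
    		return nil, err
    	}
    	return nodes, nil
    }

    func main() {
    	// Assumed location of the config file inside the container.
    	nodes, err := loadNodes("/home/gbase/config/nodes.json")
    	if err != nil {
    		fmt.Println("cannot read node config:", err)
    		return
    	}
    	for _, n := range nodes {
    		fmt.Printf("%s -> http://%s:%s\n", n.Servername, n.IP, n.Port)
    	}
    }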
    
    

    Add the info5 program to .bash_profile to make sure it keeps running

    chmod 755 info5wangwei
    chmod 755 oninitdb
    
    docker cp info5 gbase001:/home/gbase/
    docker cp info5 gbase002:/home/gbase/
    docker cp info5 gbase003:/home/gbase/
    
    docker cp oninitdb gbase001:/home/gbase/
    docker cp oninitdb gbase002:/home/gbase/
    docker cp oninitdb gbase003:/home/gbase/
    
    
    Append to the profile
    n=`ps -ef |grep info5wangwei |grep -v grep |wc -l` 
    if [ $n == 0 ]; then
    	nohup /home/gbase/info5wangwei &
    fi
    echo ""
    
    

    Packaging the program executables

    D:\01.GBase\2022GBASE工作记录\01.项目记录\17.南京磐能的测试,docker上搭建集群ER\ClusertDB源码


    Cluster Docker container list

    docker run -p 16060:6060 -p 19088:19088 --name gbase001 -itd liaosnet/gbase8s /bin/bash
    docker run -p 26060:6060 -p 29088:29088 --name gbase002 -itd liaosnet/gbase8s /bin/bash
    docker run -p 36060:6060 -p 39088:39088 --name gbase003 -itd liaosnet/gbase8s /bin/bash
    docker run -p 19099:19099 --name cm1 -itd liaosnet/gbase8s /bin/bash
    docker run -p 29099:29099 --name cm2 -itd liaosnet/gbase8s /bin/bash
    
    Notes:
        1. Port .6060 is used for cluster information exchange and for issuing cluster management commands.
        2. Port .9088 is used as the database listener port.
        3. Port .9099 is used by the Connection Manager; applications connect through these two ports and are automatically routed to the primary node of the database cluster (primary-node proxy ports).
    
    

    How to start the Docker containers under normal conditions

    Note: "normal conditions" here means the cluster still has at least one live primary node and one live CM node serving the application.

    Host 172.20.97.3

    # Start the database node container gbase001
    docker start gbase001
    docker exec gbase001 su - gbasedbt -c "oninitdb"
    # Start the database node container gbase002
    docker start gbase002
    docker exec gbase002 su - gbasedbt -c "oninitdb"
    
    # Start the CM1 node
    docker start cm1
    docker exec cm1 su - gbasedbt -c "oncmsm -c /opt/gbase/etc/cm1.cfg"
    
    
    

    Host 172.20.97.4

    # Start the database node container gbase003
    docker start gbase003
    docker exec gbase003 su - gbasedbt -c "oninitdb"
    
    # Start the CM2 node
    docker start cm2
    docker exec cm2 su - gbasedbt -c "oncmsm -c /opt/gbase/etc/cm2.cfg"
    
    

    How to start the Docker container (the first node in the cluster)

    docker start gbase003
    docker exec gbase003 su - gbasedbt -c "oninit -v"
    
    

    How to start the Docker containers (the remaining nodes)

    docker start gbase002
    docker exec gbase002 su - gbasedbt -c "oninitdb"
    
    docker start gbase001
    docker exec gbase001 su - gbasedbt -c "oninitdb"
    
    

    Expanding storage

    DATADIR=/opt/gbase/data
    
    touch ${DATADIR}/plogdbs
    touch ${DATADIR}/llogdbs
    touch ${DATADIR}/tmpdbs01
    touch ${DATADIR}/tmpdbs02
    touch ${DATADIR}/tmpdbs03
    touch ${DATADIR}/tmpdbs04
    touch ${DATADIR}/datachk02
    
    chmod 660 ${DATADIR}/plogdbs
    chmod 660 ${DATADIR}/llogdbs
    chmod 660 ${DATADIR}/tmpdbs01
    chmod 660 ${DATADIR}/tmpdbs02
    chmod 660 ${DATADIR}/tmpdbs03
    chmod 660 ${DATADIR}/tmpdbs04
    chmod 660 ${DATADIR}/datachk02
    
    # Create the new dbspaces: physical-log, logical-log, four temp dbspaces, and one data dbspace
    onspaces -c -d plogdbs   -p ${DATADIR}/plogdbs   -o 0 -s 512000
    onspaces -c -d llogdbs   -p ${DATADIR}/llogdbs   -o 0 -s 10240000
    onspaces -c -d tmpdbs01  -p ${DATADIR}/tmpdbs01  -o 0 -s 1024000 -k 8 -t
    onspaces -c -d tmpdbs02  -p ${DATADIR}/tmpdbs02  -o 0 -s 1024000 -k 8 -t
    onspaces -c -d tmpdbs03  -p ${DATADIR}/tmpdbs03  -o 0 -s 1024000 -k 8 -t
    onspaces -c -d tmpdbs04  -p ${DATADIR}/tmpdbs04  -o 0 -s 1024000 -k 8 -t
    onspaces -c -d datadbs02 -p ${DATADIR}/datachk02 -o 0 -s 1024000 -k 8
    
    # Add 100 logical logs in llogdbs, switch logs, force a checkpoint, then drop the original logs
    for i in `seq 100`;do onparams -a -d llogdbs -s 100000;done
    for i in `seq 7`;do onmode -l;done
    onmode -c
    for i in `seq 6`;do onparams -d -l $i -y;done
    
    # Move and resize the physical log into plogdbs
    onparams -p -d plogdbs -s 500000 -y
    
    
    
    

    crontab -e is not available in the container; fold this into oninitdb later (see the sketch below)

    cat > ~/useinfo5.sh

    #!/bin/bash
    n=`ps -ef |grep info5wangwei |grep -v grep |wc -l` 
    if [ $n == 0 ]; then
    	nohup /home/gbase/info5wangwei &
    fi
    echo ""
    
    

    chmod 755 ~/useinfo5.sh

    crontab -e

    * * * * * docker exec gbase001 su - gbasedbt -c "/home/gbase/useinfo5.sh"
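
    Since crontab is not available inside the container, the same check could later be folded into oninitdb itself, as the heading above suggests. A minimal sketch; the helper name and the ps-based check (mirroring useinfo5.sh) are assumptions.

    package main

    import (
    	"fmt"
    	"os/exec"
    	"strings"
    )

    // ensureInfo5 mirrors useinfo5.sh: if no info5wangwei process is running,
    // start one in the background.
    func ensureInfo5() error {
    	out, err := exec.Command("sh", "-c",
    		"ps -ef | grep info5wangwei | grep -v grep | wc -l").Output()
    	if err != nil {
    		return err
    	}
    	if strings.TrimSpace(string(out)) != "0" {
    		return nil // already running
    	}
    	// Start (not Run) so the caller does not block on the long-running service.
    	return exec.Command("/home/gbase/info5wangwei").Start()
    }

    func main() {
    	if err := ensureInfo5(); err != nil {
    		fmt.Println("failed to ensure info5 is running:", err)
    		return
    	}
    	fmt.Println("info5 service is running")
    }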
    
    
  • Original article: https://blog.csdn.net/m0_46426259/article/details/126398635