Hive UDF array_struct_sort 对Array＜Struct＞进行排序

一、UDF说明

array_struct_sort(array(struct1,struct2,...), string sortField)：Returns the passed array struct, ordered by the given field.
对所给的Array按sortField字段进行排序并返回。

二、代码


package com.scb.dss.udf;
 
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde.Constants;
import org.apache.hadoop.hive.serde2.objectinspector.*;
import org.apache.hadoop.io.Text;
 
import java.util.*;
 
import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.LIST;
 
@Description(name = "array_struct_sort",
        value = "_FUNC_(array(struct1,struct2,...), string sortField) - "
                + "Returns the passed array struct, ordered by the given field",
        extended = "Example:\n"
                + "  > SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array\n" +
                "    FROM (\n" +
                "        SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t\n" +
                "        union all \n" +
                "        SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t\n" +
                "        union all \n" +
                "        SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t\n" +
                "        union all\n" +
                "        SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t\n" +
                "    ) as test_data\n" +
                "    group by class;\n")
public class UDFArrayStructSort extends GenericUDF {
    protected ObjectInspector[] argumentOIs;
 
    ListObjectInspector loi;
    StructObjectInspector elOi;
 
    // cache comparators for performance
    Map comparatorCache = new HashMap();
 
    @Override
    public ObjectInspector initialize(ObjectInspector[] ois) throws UDFArgumentException {
        // all common initialization
        argumentOIs = ois;
 
        // clear comparator cache from previous invokations
        comparatorCache.clear();
 
        return checkAndReadObjectInspectors(ois);
    }
 
    /**
     * Utility method to check that an object inspector is of the correct type,
     * and returns its element object inspector
     *
     * @param ois
     * @return
     * @throws UDFArgumentTypeException
     */
    protected ListObjectInspector checkAndReadObjectInspectors(ObjectInspector[] ois)
            throws UDFArgumentTypeException, UDFArgumentException {
        // check number of arguments. We only accept two,
        // the list of struct to sort and the name of the struct field
        // to sort by
        if (ois.length != 2) {
            throw new UDFArgumentException("2 arguments needed, found " + ois.length);
        }
 
        // first argument must be a list/array
        if (!ois[0].getCategory().equals(LIST)) {
            throw new UDFArgumentTypeException(0, "Argument 1"
                    + " of function " + this.getClass().getCanonicalName() + " must be " + Constants.LIST_TYPE_NAME
                    + ", but " + ois[0].getTypeName()
                    + " was found.");
        }
 
        // a list/array is read by a LIST object inspector
        loi = (ListObjectInspector) ois[0];
 
        // a list has an element type associated to it
        // elements must be structs for this UDF
        if (loi.getListElementObjectInspector().getCategory() != ObjectInspector.Category.STRUCT) {
            throw new UDFArgumentTypeException(0, "Argument 1"
                    + " of function " + this.getClass().getCanonicalName() + " must be an array of structs " +
                    " but is an array of " + loi.getListElementObjectInspector().getCategory().name());
        }
 
        // store the object inspector for the elements
        elOi = (StructObjectInspector) loi.getListElementObjectInspector();
 
        // returns the same object inspector
        return loi;
    }
 
    // factory method for cached comparators
    Comparator getComparator(Text field) {
        if (!comparatorCache.containsKey(field.toString())) {
            comparatorCache.put(field.toString(), new StructFieldComparator(field.toString()));
        }
        return comparatorCache.get(field.toString());
    }
 
    @Override
    public Object evaluate(DeferredObject[] dos) throws HiveException {
        // get list
        if (dos == null || dos.length != 2) {
            throw new HiveException("received " + (dos == null ? "null" :
                    Integer.toString(dos.length) + " elements instead of 2"));
        }
 
        // each object is supposed to be a struct
        // we make a shallow copy of the list. We don't want to sort
        // the list in place since the object could be used elsewhere in the
        // hive query
        ArrayList al = new ArrayList(loi.getList(dos[0].get()));
 
        // sort with our comparator, then return
        // note that we could get a different field to sort by for every
        // invocation
        Collections.sort(al, getComparator((Text) dos[1].get()));
 
        return al;
    }
 
    @Override
    public String getDisplayString(String[] children) {
        return (children == null ? null : this.getClass().getCanonicalName() + "(" + children[0] + "," + children[1] + ")");
    }
 
    // to sort a list , we must supply our comparator
    public class StructFieldComparator implements Comparator {
        StructField field;
 
        public StructFieldComparator(String fieldName) {
            field = elOi.getStructFieldRef(fieldName);
        }
 
        public int compare(Object o1, Object o2) {
 
            // ok..so both not null
            Object f1 = elOi.getStructFieldData(o1, field);
            Object f2 = elOi.getStructFieldData(o2, field);
            // compare using hive's utility functions
            return ObjectInspectorUtils.compare(f1, field.getFieldObjectInspector(),
                    f2, field.getFieldObjectInspector());
        }
    }
 
}

三、测试

测试数据如下：

class	struct
1	{"name":"N003","age":"20"}
2	{"name":"N001","age":"18"}
1	{"name":"N002","age":"19"}
2	{"name":"N000","age":"17"}

测试代码：


SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
    FROM (
        SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
        union all 
        SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
        union all 
        SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
        union all
        SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
    ) as test_data
    group by class;

测试结果如下：

在结合上节的Hive UDAF collect_map我们就可以对MAP>>进行聚合排序操作了。


SELECT class, collect_map(class, struct_array) as res
FROM (
    SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
    FROM (
        SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
        union all 
        SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
        union all 
        SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
        union all
        SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
    ) as test_data
    group by class
) as tmp
group by class
;

跟上节的结果相比，这次出来的struct可以按照age进行排序了。

res字段类型为MAP>>，如果要取一班年纪最新的学生名字，代码如下：


select res['1'][0].name
from (
SELECT class, collect_map(class, struct_array) as res
FROM (
    SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
    FROM (
        SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
        union all 
        SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
        union all 
        SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
        union all
        SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
    ) as test_data
    group by class
) as tmp
group by class
) as t
;

可以在加个where条件去过滤NULL

四、参考文档

» Structured data in Hive: a generic UDF to sort arrays of structs Roberto Congiu's blog

五、待改进

目前UDF只支持升序排序，后续可以在加个第三个参数来控制升序、降序。只需要在实现一个StructFieldComparator。

相关阅读:
C++20之Module（浅析）
SQL 语法速成手册
 IB和A-level物理怎么选？
海康威视Java实习面试
 js实现拖动效果
 酷克数据发布HD-SQL-LLaMA模型，开启数据分析“人人可及”新时代
 维格云小程序如何快速上手开发？
【C++入门篇】保姆级教程篇【上】
npm 全局配置
 【外设】拓展坞接入外设一直弹窗报错问题
原文地址：https://blog.csdn.net/qq_37771475/article/details/126371687