array_struct_sort(array(struct1,struct2,...), string sortField):Returns the passed array struct, ordered by the given field. 对所给的Array按sortField字段进行排序并返回。
- package com.scb.dss.udf;
-
- import org.apache.hadoop.hive.ql.exec.Description;
- import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
- import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException;
- import org.apache.hadoop.hive.ql.metadata.HiveException;
- import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
- import org.apache.hadoop.hive.serde.Constants;
- import org.apache.hadoop.hive.serde2.objectinspector.*;
- import org.apache.hadoop.io.Text;
-
- import java.util.*;
-
- import static org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category.LIST;
-
- @Description(name = "array_struct_sort",
- value = "_FUNC_(array(struct1,struct2,...), string sortField) - "
- + "Returns the passed array struct, ordered by the given field",
- extended = "Example:\n"
- + " > SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array\n" +
- " FROM (\n" +
- " SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t\n" +
- " union all \n" +
- " SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t\n" +
- " union all \n" +
- " SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t\n" +
- " union all\n" +
- " SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t\n" +
- " ) as test_data\n" +
- " group by class;\n")
- public class UDFArrayStructSort extends GenericUDF {
- protected ObjectInspector[] argumentOIs;
-
- ListObjectInspector loi;
- StructObjectInspector elOi;
-
- // cache comparators for performance
- Map
comparatorCache = new HashMap(); -
- @Override
- public ObjectInspector initialize(ObjectInspector[] ois) throws UDFArgumentException {
- // all common initialization
- argumentOIs = ois;
-
- // clear comparator cache from previous invokations
- comparatorCache.clear();
-
- return checkAndReadObjectInspectors(ois);
- }
-
- /**
- * Utility method to check that an object inspector is of the correct type,
- * and returns its element object inspector
- *
- * @param ois
- * @return
- * @throws UDFArgumentTypeException
- */
- protected ListObjectInspector checkAndReadObjectInspectors(ObjectInspector[] ois)
- throws UDFArgumentTypeException, UDFArgumentException {
- // check number of arguments. We only accept two,
- // the list of struct to sort and the name of the struct field
- // to sort by
- if (ois.length != 2) {
- throw new UDFArgumentException("2 arguments needed, found " + ois.length);
- }
-
- // first argument must be a list/array
- if (!ois[0].getCategory().equals(LIST)) {
- throw new UDFArgumentTypeException(0, "Argument 1"
- + " of function " + this.getClass().getCanonicalName() + " must be " + Constants.LIST_TYPE_NAME
- + ", but " + ois[0].getTypeName()
- + " was found.");
- }
-
- // a list/array is read by a LIST object inspector
- loi = (ListObjectInspector) ois[0];
-
- // a list has an element type associated to it
- // elements must be structs for this UDF
- if (loi.getListElementObjectInspector().getCategory() != ObjectInspector.Category.STRUCT) {
- throw new UDFArgumentTypeException(0, "Argument 1"
- + " of function " + this.getClass().getCanonicalName() + " must be an array of structs " +
- " but is an array of " + loi.getListElementObjectInspector().getCategory().name());
- }
-
- // store the object inspector for the elements
- elOi = (StructObjectInspector) loi.getListElementObjectInspector();
-
- // returns the same object inspector
- return loi;
- }
-
- // factory method for cached comparators
- Comparator getComparator(Text field) {
- if (!comparatorCache.containsKey(field.toString())) {
- comparatorCache.put(field.toString(), new StructFieldComparator(field.toString()));
- }
- return comparatorCache.get(field.toString());
- }
-
- @Override
- public Object evaluate(DeferredObject[] dos) throws HiveException {
- // get list
- if (dos == null || dos.length != 2) {
- throw new HiveException("received " + (dos == null ? "null" :
- Integer.toString(dos.length) + " elements instead of 2"));
- }
-
- // each object is supposed to be a struct
- // we make a shallow copy of the list. We don't want to sort
- // the list in place since the object could be used elsewhere in the
- // hive query
- ArrayList al = new ArrayList(loi.getList(dos[0].get()));
-
- // sort with our comparator, then return
- // note that we could get a different field to sort by for every
- // invocation
- Collections.sort(al, getComparator((Text) dos[1].get()));
-
- return al;
- }
-
- @Override
- public String getDisplayString(String[] children) {
- return (children == null ? null : this.getClass().getCanonicalName() + "(" + children[0] + "," + children[1] + ")");
- }
-
- // to sort a list , we must supply our comparator
- public class StructFieldComparator implements Comparator {
- StructField field;
-
- public StructFieldComparator(String fieldName) {
- field = elOi.getStructFieldRef(fieldName);
- }
-
- public int compare(Object o1, Object o2) {
-
- // ok..so both not null
- Object f1 = elOi.getStructFieldData(o1, field);
- Object f2 = elOi.getStructFieldData(o2, field);
- // compare using hive's utility functions
- return ObjectInspectorUtils.compare(f1, field.getFieldObjectInspector(),
- f2, field.getFieldObjectInspector());
- }
- }
-
- }
测试数据如下:
| class | struct |
| 1 | {"name":"N003","age":"20"} |
| 2 | {"name":"N001","age":"18"} |
| 1 | {"name":"N002","age":"19"} |
| 2 | {"name":"N000","age":"17"} |
测试代码:
- SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
- FROM (
- SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
- union all
- SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
- ) as test_data
- group by class;
测试结果如下:
![]()
在结合上节的Hive UDAF collect_map我们就可以对MAP
- SELECT class, collect_map(class, struct_array) as res
- FROM (
- SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
- FROM (
- SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
- union all
- SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
- ) as test_data
- group by class
- ) as tmp
- group by class
- ;
![]()
跟上节的结果相比,这次出来的struct可以按照age进行排序了。
res字段类型为MAP
- select res['1'][0].name
- from (
- SELECT class, collect_map(class, struct_array) as res
- FROM (
- SELECT class, array_struct_sort(collect_list(struct_t), 'age') as struct_array
- FROM (
- SELECT '1' as class, named_struct('name', 'N003', 'age', '20') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N001', 'age', '18') as struct_t
- union all
- SELECT '1' as class, named_struct('name', 'N002', 'age', '19') as struct_t
- union all
- SELECT '2' as class, named_struct('name', 'N000', 'age', '17') as struct_t
- ) as test_data
- group by class
- ) as tmp
- group by class
- ) as t
- ;
可以在加个where条件去过滤NULL
» Structured data in Hive: a generic UDF to sort arrays of structs Roberto Congiu's blog
目前UDF只支持升序排序,后续可以在加个第三个参数来控制升序、降序。只需要在实现一个StructFieldComparator。