UDF函数:求两个字符串列表或者数组之间的交集,差集,并集,补集

下面的 Hive UDF 用于求两个以分隔符连接的字符串列表(或数组)之间的交集、差集、并集和补集。注意:这里的“补集”指对称差集,即只出现在其中一个列表中的元素。
import org.apache.commons.collections.CollectionUtils;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Hive UDF that applies a set operation to two delimiter-joined string lists and
 * reports the result as a count, as the joined elements, or as the joined element lengths.
 *
 * <p>Example: {@code supportArr(1, '.', '1.2.3.4.5.6', '3.4.5.6.7', 1)} returns {@code 4}.
 *
 * <p>Arguments of {@link #evaluate(IntWritable, Text, Text, Text, IntWritable)}:
 * <ol>
 *   <li>operation: 1 = intersection, 2 = disjunction (symmetric difference, called
 *       "complement" in the original notes), 3 = difference (first minus second),
 *       4 = union (an element shared by both lists is not repeated, i.e. UNION rather
 *       than UNION ALL);</li>
 *   <li>the separator used inside both strings; it is matched literally, never as a
 *       regular expression;</li>
 *   <li>the first separator-joined string;</li>
 *   <li>the second separator-joined string;</li>
 *   <li>return type: 1 = number of elements, 2 = the elements joined by the separator,
 *       3 = the length of every element joined by the separator.</li>
 * </ol>
 *
 * <p>An earlier three-argument form {@code supportArr(sep, arr1, arr2)}, which returned
 * only the number of common elements, is deprecated and no longer implemented here;
 * use operation 1 with return type 1 instead.
 *
 * <p>The {@link Text} returned by every method is a single instance owned by this
 * object and is overwritten by the next call.
 */
public class UDFSupportArr extends UDF{
    // Values accepted for the "operation" argument.
    private static final int OP_INTERSECTION = 1;
    private static final int OP_DISJUNCTION = 2;
    private static final int OP_SUBTRACT = 3;
    private static final int OP_UNION = 4;

    // Values accepted for the "return type" argument.
    private static final int RETURN_COUNT = 1;
    private static final int RETURN_VALUES = 2;
    private static final int RETURN_LENGTHS = 3;

    // Reused for every call so that no new Writable is allocated per row.
    private final Text result = new Text();

    /**
     * UDF entry point: dispatches to the requested set operation.
     *
     * @param btype operation selector (1 intersection, 2 disjunction, 3 difference, 4 union)
     * @param tsep  separator used inside {@code arr1} and {@code arr2}
     * @param arr1  first separator-joined list
     * @param arr2  second separator-joined list
     * @param type  return type selector (1 count, 2 values, 3 value lengths)
     * @return the formatted result, or {@code null} when any argument is {@code null}
     *         or when {@code btype} or {@code type} is not a supported value
     */
    public Text evaluate(IntWritable btype, Text tsep, Text arr1, Text arr2, IntWritable type)
    {
        // Hive hands over SQL NULL as a Java null; any NULL input yields a NULL result.
        if (btype == null || tsep == null || arr1 == null || arr2 == null || type == null)
        {
            return null;
        }

        String sep = tsep.toString();
        String first = arr1.toString();
        String second = arr2.toString();
        int returnType = type.get();

        switch (btype.get())
        {
            case OP_INTERSECTION:
                return intersection(sep, first, second, returnType);
            case OP_DISJUNCTION:
                return disjunction(sep, first, second, returnType);
            case OP_SUBTRACT:
                return subtract(sep, first, second, returnType);
            case OP_UNION:
                return union(sep, first, second, returnType);
            default:
                return null;
        }
    }

    /**
     * Elements present in both lists.
     *
     * @param tsep literal separator
     * @param arr1 first separator-joined list
     * @param arr2 second separator-joined list
     * @param type return type selector (1 count, 2 values, 3 value lengths)
     * @return the formatted result, or {@code null} for an unsupported {@code type}
     */
    @SuppressWarnings("unchecked")
    public Text intersection(String tsep, String arr1, String arr2, int type)
    {
        Collection<String> common =
                (Collection<String>) CollectionUtils.intersection(split(arr1, tsep), split(arr2, tsep));
        return render(common, tsep, type, false);
    }

    /**
     * Elements present in exactly one of the two lists (symmetric difference).
     * Empty elements are left out of the value and length output, but are still
     * included in the count returned for return type 1.
     *
     * @param tsep literal separator
     * @param arr1 first separator-joined list
     * @param arr2 second separator-joined list
     * @param type return type selector (1 count, 2 values, 3 value lengths)
     * @return the formatted result, or {@code null} for an unsupported {@code type}
     */
    @SuppressWarnings("unchecked")
    public Text disjunction(String tsep, String arr1, String arr2, int type)
    {
        Collection<String> exclusive =
                (Collection<String>) CollectionUtils.disjunction(split(arr1, tsep), split(arr2, tsep));
        return render(exclusive, tsep, type, true);
    }

    /**
     * Elements of the first list that are not in the second list.
     *
     * @param tsep literal separator
     * @param arr1 first separator-joined list
     * @param arr2 second separator-joined list
     * @param type return type selector (1 count, 2 values, 3 value lengths)
     * @return the formatted result, or {@code null} for an unsupported {@code type}
     */
    @SuppressWarnings("unchecked")
    public Text subtract(String tsep, String arr1, String arr2, int type)
    {
        Collection<String> remaining =
                (Collection<String>) CollectionUtils.subtract(split(arr1, tsep), split(arr2, tsep));
        return render(remaining, tsep, type, false);
    }

    /**
     * Elements present in either list; an element shared by both lists is not repeated.
     * Empty elements are left out of the value and length output, but are still
     * included in the count returned for return type 1.
     *
     * @param tsep literal separator
     * @param arr1 first separator-joined list
     * @param arr2 second separator-joined list
     * @param type return type selector (1 count, 2 values, 3 value lengths)
     * @return the formatted result, or {@code null} for an unsupported {@code type}
     */
    @SuppressWarnings("unchecked")
    public Text union(String tsep, String arr1, String arr2, int type)
    {
        Collection<String> combined =
                (Collection<String>) CollectionUtils.union(split(arr1, tsep), split(arr2, tsep));
        return render(combined, tsep, type, true);
    }

    /** Splits a separator-joined string into its elements; an empty string is an empty list. */
    private static List<String> split(String joined, String sep)
    {
        if (joined.length() == 0)
        {
            // "".split(...) would yield one empty element rather than no elements.
            return Arrays.asList(new String[0]);
        }
        // Quote the separator: String.split takes a regex, so a raw "." or "|" would
        // match something other than the literal separator.
        return Arrays.asList(joined.split(Pattern.quote(sep)));
    }

    /** Writes the count, the joined values or the joined value lengths into the shared result. */
    private Text render(Collection<String> elements, String sep, int type, boolean skipEmpty)
    {
        if (type == RETURN_COUNT)
        {
            result.set(String.valueOf(elements.size()));
            return result;
        }
        if (type != RETURN_VALUES && type != RETURN_LENGTHS)
        {
            return null;
        }

        StringBuilder joined = new StringBuilder();
        // A flag rather than joined.length(): an empty first element must still be
        // followed by a separator.
        boolean first = true;
        for (String element : elements)
        {
            if (skipEmpty && element.length() == 0)
            {
                continue;
            }
            if (!first)
            {
                joined.append(sep);
            }
            first = false;
            if (type == RETURN_VALUES)
            {
                joined.append(element);
            } else
            {
                joined.append(element.length());
            }
        }
        result.set(joined.toString());
        return result;
    }

    /** Small manual demo printing every operation / return type combination. */
    public static void main(String[] args)
    {
        UDFSupportArr a = new UDFSupportArr();
        Text sep = new Text(",");
        Text v1 = new Text("a,b,c,d");
        Text v2 = new Text("a,b,e,f");
        System.out.println();
        System.out.println("交集-个数:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(1)));
        System.out.println("交集-字符:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(2)));
        System.out.println("交集-长度:" + a.evaluate(new IntWritable(1), sep, v1, v2, new IntWritable(3)));
        System.out.println();
        System.out.println("补集-个数:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(1)));
        System.out.println("补集-字符:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(2)));
        System.out.println("补集-长度:" + a.evaluate(new IntWritable(2), sep, v1, v2, new IntWritable(3)));
        System.out.println();
        System.out.println("差集-个数:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(1)));
        System.out.println("差集-字符:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(2)));
        System.out.println("差集-长度:" + a.evaluate(new IntWritable(3), sep, v1, v2, new IntWritable(3)));
        System.out.println();
        System.out.println("并集-个数:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(1)));
        System.out.println("并集-字符:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(2)));
        System.out.println("并集-长度:" + a.evaluate(new IntWritable(4), sep, v1, v2, new IntWritable(3)));

    }
}
测试数据结果如下:


交集-个数:2
交集-字符:a,b
交集-长度:1,1


补集-个数:4
补集-字符:c,d,e,f
补集-长度:1,1,1,1


差集-个数:2
差集-字符:c,d
差集-长度:1,1


并集-个数:6
并集-字符:a,b,c,d,e,f
并集-长度:1,1,1,1,1,1



猜你喜欢

转载自blog.csdn.net/qq_26442553/article/details/80459658