// Spark inverted-index example.
package sparkTest.rdd;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

/**
 * @author yuzhijun
 *
 * Inverted index example.
 * Program input (/files/InvertedIndex.txt contents):
 *   url01: key01,key02,key03
 *   url02: key01,key04,key05,key06
 *   url03: key01,key02,key03
 *   url04: key01,key02,key03,key04,key05
 *   url05: key02,key03,key04,key05
 * Output:
 *   (key01,[url01, url02, url03, url04])
 *   (key02,[url01, url03, url04, url05])
 *   (key03,[url01, url03, url04, url05])
 *   (key04,[url02, url04, url05])
 *   (key05,[url02, url04, url05])
 *   (key06,[url02])
 */

/**
* 倒排索引 InvertedIndex
*/
public class InvertedIndex {

public static void main(String[] args) {

SparkConf conf = (new SparkConf()).setAppName("InvertedIndex").setMaster("local[1]");
    @SuppressWarnings("resource")
JavaSparkContext sc = new JavaSparkContext(conf); //创建spark上下文
    JavaRDD<String> file = sc.textFile(System.getProperty("user.dir")+"/files/InvertedIndex.txt"); //Load file
   
    /*
     * Generate (url address, keyword list) data format
     * /
    JavaPairRDD<String, String> urlKeys = file.mapToPair(new PairFunction<String,String,String>(){
private static final long serialVersionUID = 1L;
public Tuple2<String,String> call(String line){
    String[] arr = line.split(":");
    String key = arr[0]; //url address
    String value = arr[1]; //keyword list
    return new Tuple2<String,String>(key,value);
    }
    });
   
    /*
     * Use the keyword as key and URL as value to generate (keyword, URL) data format
     */
    JavaRDD<Tuple2<String,String>> keyUrl = urlKeys.flatMap(new FlatMapFunction<Tuple2<String,String>,Tuple2<String,String>>(){
private static final long serialVersionUID = 1L;
public Iterable<Tuple2<String, String>> call(Tuple2<String,String> t){
    String url = t._1;
    String keys = t._2; //关键字
    String[] keyArr = keys.split(",");
    List<Tuple2<String,String>> keyUrlList = new ArrayList<Tuple2<String,String>>();
    for(String key : keyArr){ //循环遍历每个url对应的关键字
    keyUrlList.add(new Tuple2<String,String>(key,url));
    }
    return keyUrlList;
    }
    });      1 use keyword as key and url as value
   
    /*

     2 Summarize groupby
     according to key 3 Sort according to key
     4 Save the result to file
     */
    keyUrl.mapToPair(new PairFunction<Tuple2<String,String>,String,String>(){
private static final long serialVersionUID = 1L;
public Tuple2 <String,String> call(Tuple2<String,String> t){
    return new Tuple2<String,String>(t._1,t._2 ); //(keyword,url)
    }
    }).groupByKey(). sortByKey().saveAsTextFile(System.getProperty("user.dir")+"/files/SparkRddTest01/InvertedIndex");
}

}
