April 07, 2017

MapReduce Notes

Hadoop is an open-source implementation of Google’s MapReduce. Hadoop presents MapReduce as an analytics engine and, “under the hood,” employs the Hadoop Distributed File System (HDFS).

MapReduce:

  • Definition: Programming model for processing large data sets with a parallel, distributed algorithm on a cluster
  • Map: Extract something you care about
  • Group by key: Sort and Shuffle
  • Reduce: Aggregate, summarize, filter or transform

Hadoop Streaming Job Flow: Wordcount

cat input | mapper_program | sort | reducer_program > output

A Mapper Program (wordcountMapper.java)

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
/**
 * Hadoop Streaming mapper for the word-count example.
 *
 * <p>Reads text lines from standard input, splits each line into tokens using
 * {@link StringTokenizer}'s default delimiters (space, tab, newline, carriage return,
 * form feed), and writes one {@code word<TAB>1} record per token to standard output.
 */
public class wordcountMapper{
    /**
     * Streams standard input to standard output as {@code word<TAB>1} records.
     *
     * @param args unused
     */
    public static void main(String args[]){
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
            String input;
            // While we have input on stdin
            while((input = br.readLine()) != null){
                StringTokenizer tokenizer = new StringTokenizer(input);
                // Blank lines produce no tokens and therefore no output records.
                while (tokenizer.hasMoreTokens()){
                    String word = tokenizer.nextToken();
                    System.out.println(word + "\t" + "1");
                }
            }
        } catch(IOException io){
            // Reading stdin failed; report it on stderr and stop.
            io.printStackTrace();
        }
    }
}

Run the mapper and sort its output by key:

cat input.txt | java wordcountMapper | sort

A Reducer Program (wordcountReducer.java)

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.StringTokenizer;
/**
 * Hadoop Streaming reducer for the word-count example.
 *
 * <p>Reads {@code word<TAB>count} records from standard input and writes one
 * {@code word<TAB>total} record per run of consecutive identical words to standard output.
 * The input must be sorted by word so that all records for a word are adjacent.
 */
public class wordcountReducer {
    /**
     * Sums the counts of adjacent records that share the same word.
     *
     * <p>Records without a tab-separated count, or whose count is not a valid integer,
     * are skipped and do not affect the running totals.
     *
     * @param args unused
     */
    public static void main(String args[]){
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
            String input;
            String currentWord = null;
            int currentCount = 0;

            while ((input = br.readLine()) != null) {
                String[] parts = input.split("\t");
                if (parts.length < 2) {
                    // No count field (e.g. a line without a tab): skip instead of crashing.
                    continue;
                }

                int count;
                try {
                    count = Integer.parseInt(parts[1]);
                } catch (NumberFormatException ignored) {
                    // Deliberately skip records whose count is not an integer.
                    continue;
                }
                String word = parts[0];

                // We have sorted input, so check if we have the same word
                if (word.equals(currentWord)) {
                    // Add the record's count (not just 1) so pre-aggregated input sums correctly.
                    currentCount += count;
                } else { // the word has changed
                    if (currentWord != null) {
                        System.out.println(currentWord + "\t" + currentCount);
                    }
                    currentWord = word;
                    currentCount = count;
                }
            }

            // Flush the final group; it exists whenever at least one valid record was read.
            if (currentWord != null) {
                System.out.println(currentWord + "\t" + currentCount);
            }
        } catch(IOException io) {
            // Reading stdin failed; report it on stderr and stop.
            io.printStackTrace();
        }
    }
}
comments powered by Disqus