使用Hadoop处理数据时，在Reduce阶段

玻璃心

2016-01-26

有时候，我们使用Hadoop处理数据时，在Reduce阶段可能想把每一个输出的key单独输出到一个目录或文件，这样方便数据分析，比如按某个时间段对日志文件进行归类等等。这时候我们就可以使用MultipleOutputs类来搞定这件事。

下面，先来看下散仙的测试数据：

中国;我们  
美国;他们  
中国;123  
中国人;善良  
美国;USA  
美国;在北美洲

中国;我们
美国;他们
中国;123
中国人;善良
美国;USA
美国;在北美洲

预期输出结果是：
中国一组，美国一组，中国人一组
核心代码如下：

package com.partition.test;  
  
import java.io.IOException;  
  
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.Text;  
import org.apache.hadoop.mapred.JobConf;  
import org.apache.hadoop.mapreduce.Job;  
import org.apache.hadoop.mapreduce.Mapper;  
import org.apache.hadoop.mapreduce.Partitioner;  
import org.apache.hadoop.mapreduce.Reducer;  
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;  
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;  
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;  
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;  
  
import com.qin.operadb.PersonRecoder;  
import com.qin.operadb.ReadMapDB;  
   
/*** 
 * @author qindongliang 
 *  
 * 大数据技术交流群:324714439 
 * **/  
public class TestMultiOutput {  
      
      
    /** 
     * map任务 
     *  
     * **/  
    public static class PMapper extends Mapper<LongWritable, Text, Text, Text>{  
          
        @Override  
        protected void map(LongWritable key, Text value,Context context)  
                throws IOException, InterruptedException {  
             String ss[]=value.toString().split(";");  
            context.write(new Text(ss[0]), new Text(ss[1]));      
        }  
          
          
    }  
      
   
     public static class PReduce extends Reducer<Text, Text, Text, Text>{  
         /** 
          * 设置多个文件输出 
          * */  
         private MultipleOutputs mos;  
           
         @Override  
        protected void setup(Context context)  
                throws IOException, InterruptedException {  
              mos=new MultipleOutputs(context);//初始化mos  
        }  
         @Override  
        protected void reduce(Text arg0, Iterable<Text> arg1, Context arg2)  
                throws IOException, InterruptedException {  
               
              String key=arg0.toString();  
             for(Text t:arg1){  
                   if(key.equals("中国")){   
                       /** 
                        * 一个参数 
                        * **/  
                       mos.write("china", arg0,t);   
                   } else if(key.equals("美国")){  
                       mos.write("USA", arg0,t);      
                   } else if(key.equals("中国人")){  
                       mos.write("cperson", arg0,t);   
                         
                   }  
           
                 //System.out.println("Reduce:  "+arg0.toString()+"   "+t.toString());  
             }  
                 
               
        }  
           
         @Override  
        protected void cleanup(  
                 Context context)  
                throws IOException, InterruptedException {  
             mos.close();//释放资源  
        }  
           
     }  
       
       
     public static void main(String[] args) throws Exception{  
         JobConf conf=new JobConf(ReadMapDB.class);  
         //Configuration conf=new Configuration();  
        // conf.set("mapred.job.tracker","192.168.75.130:9001");  
        //读取person中的数据字段  
        // conf.setJar("tt.jar");  
        //注意这行代码放在最前面，进行初始化，否则会报  
       
       
        /**Job任务**/  
        Job job=new Job(conf, "testpartion");  
        job.setJarByClass(TestMultiOutput.class);  
        System.out.println("模式：  "+conf.get("mapred.job.tracker"));;  
        // job.setCombinerClass(PCombine.class);  
        //job.setPartitionerClass(PPartition.class);  
        //job.setNumReduceTasks(5);  
         job.setMapperClass(PMapper.class);  
           
         /** 
          * 注意在初始化时需要设置输出文件的名 
          * 另外名称，不支持中文名，仅支持英文字符 
          *  
          * **/  
         MultipleOutputs.addNamedOutput(job, "china", TextOutputFormat.class, Text.class, Text.class);  
         MultipleOutputs.addNamedOutput(job, "USA", TextOutputFormat.class, Text.class, Text.class);  
         MultipleOutputs.addNamedOutput(job, "cperson", TextOutputFormat.class, Text.class, Text.class);  
         job.setReducerClass(PReduce.class);  
         job.setOutputKeyClass(Text.class);  
         job.setOutputValueClass(Text.class);  
          
        String path="hdfs://192.168.75.130:9000/root/outputdb";  
        FileSystem fs=FileSystem.get(conf);  
        Path p=new Path(path);  
        if(fs.exists(p)){  
            fs.delete(p, true);  
            System.out.println("输出路径存在，已删除！");  
        }  
        FileInputFormat.setInputPaths(job, "hdfs://192.168.75.130:9000/root/input");  
        FileOutputFormat.setOutputPath(job,p );  
        System.exit(job.waitForCompletion(true) ? 0 : 1);    
           
           
    }  
      
      
  
}

package com.partition.test;

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.db.DBConfiguration;
import org.apache.hadoop.mapreduce.lib.db.DBInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import com.qin.operadb.PersonRecoder;
import com.qin.operadb.ReadMapDB;
 
/***
 * @author qindongliang
 * 
 * 大数据技术交流群:324714439
 * **/
public class TestMultiOutput {

	/** Named output receiving the records whose key is "中国". */
	private static final String CHINA_OUTPUT = "china";

	/** Named output receiving the records whose key is "美国". */
	private static final String USA_OUTPUT = "USA";

	/** Named output receiving the records whose key is "中国人". */
	private static final String CPERSON_OUTPUT = "cperson";

	/** Input path used when no first command-line argument is given. */
	private static final String DEFAULT_INPUT_PATH = "hdfs://192.168.75.130:9000/root/input";

	/** Output path used when no second command-line argument is given. */
	private static final String DEFAULT_OUTPUT_PATH = "hdfs://192.168.75.130:9000/root/outputdb";

	/**
	 * Map task: splits every input line on ';' and emits
	 * (first field, second field).
	 */
	public static class PMapper extends Mapper<LongWritable, Text, Text, Text> {

		/**
		 * @param key     position of the line in the input (unused)
		 * @param value   one input line, expected as {@code key;value}
		 * @param context task context the pair is written to
		 */
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] fields = value.toString().split(";");
			if (fields.length < 2) {
				// A blank line or a line without ';' has no second field; count and
				// skip it instead of failing the task with ArrayIndexOutOfBoundsException.
				context.getCounter("TestMultiOutput", "MALFORMED_LINES").increment(1);
				return;
			}
			context.write(new Text(fields[0]), new Text(fields[1]));
		}
	}

	/**
	 * Reduce task: routes every value of a known key to the named output
	 * registered for that key. Values of any other key are dropped.
	 */
	public static class PReduce extends Reducer<Text, Text, Text, Text> {

		/** Writer for the named outputs; created in setup, closed in cleanup. */
		private MultipleOutputs<Text, Text> mos;

		@Override
		protected void setup(Context context)
				throws IOException, InterruptedException {
			mos = new MultipleOutputs<Text, Text>(context);
		}

		/**
		 * @param key     group key produced by the mapper
		 * @param values  all values emitted for {@code key}
		 * @param context task context (unused; records go through {@code mos})
		 */
		@Override
		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// The target only depends on the key, so resolve it once per group.
			String namedOutput = namedOutputFor(key.toString());
			if (namedOutput == null) {
				return;
			}
			for (Text value : values) {
				mos.write(namedOutput, key, value);
			}
		}

		/**
		 * Maps a record key to its named output.
		 *
		 * @return the named output, or {@code null} when the key has none
		 */
		private static String namedOutputFor(String key) {
			if ("中国".equals(key)) {
				return CHINA_OUTPUT;
			}
			if ("美国".equals(key)) {
				return USA_OUTPUT;
			}
			if ("中国人".equals(key)) {
				return CPERSON_OUTPUT;
			}
			return null;
		}

		@Override
		protected void cleanup(Context context)
				throws IOException, InterruptedException {
			// mos stays null when setup failed before assigning it.
			if (mos != null) {
				mos.close();
			}
		}
	}

	/**
	 * Configures and runs the job.
	 *
	 * @param args optional: {@code args[0]} is the input path and {@code args[1]}
	 *             the output path; the built-in HDFS defaults are used when absent
	 * @throws Exception if the job cannot be set up or submitted
	 */
	public static void main(String[] args) throws Exception {
		String inputPath = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
		String outputPath = args.length > 1 ? args[1] : DEFAULT_OUTPUT_PATH;

		JobConf conf = new JobConf(TestMultiOutput.class);

		Job job = new Job(conf, "testpartion");
		job.setJarByClass(TestMultiOutput.class);
		System.out.println("模式：  " + conf.get("mapred.job.tracker"));

		job.setMapperClass(PMapper.class);
		job.setReducerClass(PReduce.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);

		// Every named output must be registered before the job is submitted.
		// Names may only contain ASCII letters and digits (no Chinese characters).
		MultipleOutputs.addNamedOutput(job, CHINA_OUTPUT, TextOutputFormat.class, Text.class, Text.class);
		MultipleOutputs.addNamedOutput(job, USA_OUTPUT, TextOutputFormat.class, Text.class, Text.class);
		MultipleOutputs.addNamedOutput(job, CPERSON_OUTPUT, TextOutputFormat.class, Text.class, Text.class);

		Path p = new Path(outputPath);
		// Resolve the file system from the path itself so a fully-qualified hdfs://
		// URI also works when the configured default file system is a different one.
		FileSystem fs = p.getFileSystem(conf);
		if (fs.exists(p)) {
			fs.delete(p, true);
			System.out.println("输出路径存在，已删除！");
		}

		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, p);
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}

如果命名输出（namedOutput）的名称使用了中文，则会报如下异常：

模式：  local  
输出路径存在，已删除！  
WARN - NativeCodeLoader.<clinit>(52) | Unable to load native-hadoop library for your platform... using builtin-java classes where applicable  
WARN - JobClient.copyAndConfigureFiles(746) | Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.  
WARN - JobClient.copyAndConfigureFiles(870) | No job jar file set.  User classes may not be found. See JobConf(Class) or JobConf#setJar(String).  
INFO - FileInputFormat.listStatus(237) | Total input paths to process : 1  
WARN - LoadSnappy.<clinit>(46) | Snappy native library not loaded  
INFO - JobClient.monitorAndPrintJob(1380) | Running job: job_local1533332464_0001  
INFO - LocalJobRunner$Job.run(340) | Waiting for map tasks  
INFO - LocalJobRunner$Job$MapTaskRunnable.run(204) | Starting task: attempt_local1533332464_0001_m_000000_0  
INFO - Task.initialize(534) |  Using ResourceCalculatorPlugin : null  
INFO - MapTask.runNewMapper(729) | Processing split: hdfs://192.168.75.130:9000/root/input/group.txt:0+91  
INFO - MapTask$MapOutputBuffer.<init>(949) | io.sort.mb = 100  
INFO - MapTask$MapOutputBuffer.<init>(961) | data buffer = 79691776/99614720  
INFO - MapTask$MapOutputBuffer.<init>(962) | record buffer = 262144/327680  
INFO - MapTask$MapOutputBuffer.flush(1289) | Starting flush of map output  
INFO - MapTask$MapOutputBuffer.sortAndSpill(1471) | Finished spill 0  
INFO - Task.done(858) | Task:attempt_local1533332464_0001_m_000000_0 is done. And is in the process of commiting  
INFO - LocalJobRunner$Job.statusUpdate(466) |   
INFO - Task.sendDone(970) | Task 'attempt_local1533332464_0001_m_000000_0' done.  
INFO - LocalJobRunner$Job$MapTaskRunnable.run(229) | Finishing task: attempt_local1533332464_0001_m_000000_0  
INFO - LocalJobRunner$Job.run(348) | Map task executor complete.  
INFO - Task.initialize(534) |  Using ResourceCalculatorPlugin : null  
INFO - LocalJobRunner$Job.statusUpdate(466) |   
INFO - Merger$MergeQueue.merge(408) | Merging 1 sorted segments  
INFO - Merger$MergeQueue.merge(491) | Down to the last merge-pass, with 1 segments left of total size: 101 bytes  
INFO - LocalJobRunner$Job.statusUpdate(466) |   
WARN - LocalJobRunner$Job.run(435) | job_local1533332464_0001  
java.lang.IllegalArgumentException: Name cannot be have a '一' char  
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.checkTokenName(MultipleOutputs.java:160)  
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.checkNamedOutputName(MultipleOutputs.java:186)  
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:363)  
    at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:348)  
    at com.partition.test.TestMultiOutput$PReduce.reduce(TestMultiOutput.java:74)  
    at com.partition.test.TestMultiOutput$PReduce.reduce(TestMultiOutput.java:1)  
    at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)  
    at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)  
    at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)  
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398)  
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 0%  
INFO - JobClient.monitorAndPrintJob(1448) | Job complete: job_local1533332464_0001  
INFO - Counters.log(585) | Counters: 17  
INFO - Counters.log(587) |   File Input Format Counters   
INFO - Counters.log(589) |     Bytes Read=91  
INFO - Counters.log(587) |   FileSystemCounters  
INFO - Counters.log(589) |     FILE_BYTES_READ=177  
INFO - Counters.log(589) |     HDFS_BYTES_READ=91  
INFO - Counters.log(589) |     FILE_BYTES_WRITTEN=71111  
INFO - Counters.log(587) |   Map-Reduce Framework  
INFO - Counters.log(589) |     Map output materialized bytes=105  
INFO - Counters.log(589) |     Map input records=6  
INFO - Counters.log(589) |     Reduce shuffle bytes=0  
INFO - Counters.log(589) |     Spilled Records=6  
INFO - Counters.log(589) |     Map output bytes=87  
INFO - Counters.log(589) |     Total committed heap usage (bytes)=227737600  
INFO - Counters.log(589) |     Combine input records=0  
INFO - Counters.log(589) |     SPLIT_RAW_BYTES=112  
INFO - Counters.log(589) |     Reduce input records=0  
INFO - Counters.log(589) |     Reduce input groups=0  
INFO - Counters.log(589) |     Combine output records=0  
INFO - Counters.log(589) |     Reduce output records=0  
INFO - Counters.log(589) |     Map output records=6

模式：  local
输出路径存在，已删除！
WARN - NativeCodeLoader.<clinit>(52) | Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
WARN - JobClient.copyAndConfigureFiles(746) | Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
WARN - JobClient.copyAndConfigureFiles(870) | No job jar file set.  User classes may not be found. See JobConf(Class) or JobConf#setJar(String).
INFO - FileInputFormat.listStatus(237) | Total input paths to process : 1
WARN - LoadSnappy.<clinit>(46) | Snappy native library not loaded
INFO - JobClient.monitorAndPrintJob(1380) | Running job: job_local1533332464_0001
INFO - LocalJobRunner$Job.run(340) | Waiting for map tasks
INFO - LocalJobRunner$Job$MapTaskRunnable.run(204) | Starting task: attempt_local1533332464_0001_m_000000_0
INFO - Task.initialize(534) |  Using ResourceCalculatorPlugin : null
INFO - MapTask.runNewMapper(729) | Processing split: hdfs://192.168.75.130:9000/root/input/group.txt:0+91
INFO - MapTask$MapOutputBuffer.<init>(949) | io.sort.mb = 100
INFO - MapTask$MapOutputBuffer.<init>(961) | data buffer = 79691776/99614720
INFO - MapTask$MapOutputBuffer.<init>(962) | record buffer = 262144/327680
INFO - MapTask$MapOutputBuffer.flush(1289) | Starting flush of map output
INFO - MapTask$MapOutputBuffer.sortAndSpill(1471) | Finished spill 0
INFO - Task.done(858) | Task:attempt_local1533332464_0001_m_000000_0 is done. And is in the process of commiting
INFO - LocalJobRunner$Job.statusUpdate(466) | 
INFO - Task.sendDone(970) | Task 'attempt_local1533332464_0001_m_000000_0' done.
INFO - LocalJobRunner$Job$MapTaskRunnable.run(229) | Finishing task: attempt_local1533332464_0001_m_000000_0
INFO - LocalJobRunner$Job.run(348) | Map task executor complete.
INFO - Task.initialize(534) |  Using ResourceCalculatorPlugin : null
INFO - LocalJobRunner$Job.statusUpdate(466) | 
INFO - Merger$MergeQueue.merge(408) | Merging 1 sorted segments
INFO - Merger$MergeQueue.merge(491) | Down to the last merge-pass, with 1 segments left of total size: 101 bytes
INFO - LocalJobRunner$Job.statusUpdate(466) | 
WARN - LocalJobRunner$Job.run(435) | job_local1533332464_0001
java.lang.IllegalArgumentException: Name cannot be have a '一' char
	at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.checkTokenName(MultipleOutputs.java:160)
	at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.checkNamedOutputName(MultipleOutputs.java:186)
	at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:363)
	at org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.write(MultipleOutputs.java:348)
	at com.partition.test.TestMultiOutput$PReduce.reduce(TestMultiOutput.java:74)
	at com.partition.test.TestMultiOutput$PReduce.reduce(TestMultiOutput.java:1)
	at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177)
	at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649)
	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418)
	at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:398)
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 0%
INFO - JobClient.monitorAndPrintJob(1448) | Job complete: job_local1533332464_0001
INFO - Counters.log(585) | Counters: 17
INFO - Counters.log(587) |   File Input Format Counters 
INFO - Counters.log(589) |     Bytes Read=91
INFO - Counters.log(587) |   FileSystemCounters
INFO - Counters.log(589) |     FILE_BYTES_READ=177
INFO - Counters.log(589) |     HDFS_BYTES_READ=91
INFO - Counters.log(589) |     FILE_BYTES_WRITTEN=71111
INFO - Counters.log(587) |   Map-Reduce Framework
INFO - Counters.log(589) |     Map output materialized bytes=105
INFO - Counters.log(589) |     Map input records=6
INFO - Counters.log(589) |     Reduce shuffle bytes=0
INFO - Counters.log(589) |     Spilled Records=6
INFO - Counters.log(589) |     Map output bytes=87
INFO - Counters.log(589) |     Total committed heap usage (bytes)=227737600
INFO - Counters.log(589) |     Combine input records=0
INFO - Counters.log(589) |     SPLIT_RAW_BYTES=112
INFO - Counters.log(589) |     Reduce input records=0
INFO - Counters.log(589) |     Reduce input groups=0
INFO - Counters.log(589) |     Combine output records=0
INFO - Counters.log(589) |     Reduce output records=0
INFO - Counters.log(589) |     Map output records=6

源码中关于名称的校验如下：

/** 
  * Checks if a named output name is a valid token. 
  * A valid name is non-empty and consists only of the ASCII characters 
  * 'A'-'Z', 'a'-'z' and '0'-'9'; the first character outside these ranges 
  * (for example a Chinese character) causes the name to be rejected. 
  * 
  * @param namedOutput named output name to validate 
  * @throws IllegalArgumentException if the name is null, empty, or contains 
  *         a character other than an ASCII letter or digit. 
  */  
 private static void checkTokenName(String namedOutput) {  
   // Reject a missing or empty name up front.
   if (namedOutput == null || namedOutput.length() == 0) {  
     throw new IllegalArgumentException(  
       "Name cannot be NULL or emtpy");  
   }  
   // Inspect every character; each allowed range skips to the next character.
   for (char ch : namedOutput.toCharArray()) {  
     if ((ch >= 'A') && (ch <= 'Z')) {  
       continue;  
     }  
     if ((ch >= 'a') && (ch <= 'z')) {  
       continue;  
     }  
     if ((ch >= '0') && (ch <= '9')) {  
       continue;  
     }  
     // Reached only when ch matched none of the ranges above; the message
     // names the offending character.
     throw new IllegalArgumentException(  
       "Name cannot be have a '" + ch + "' char");  
   }  
 }

/**
   * Checks if a named output name is a valid token.
   * A valid name is non-empty and consists only of the ASCII characters
   * 'A'-'Z', 'a'-'z' and '0'-'9'; the first character outside these ranges
   * (for example a Chinese character) causes the name to be rejected.
   *
   * @param namedOutput named output name to validate
   * @throws IllegalArgumentException if the name is null, empty, or contains
   *         a character other than an ASCII letter or digit.
   */
  private static void checkTokenName(String namedOutput) {
    // Reject a missing or empty name up front.
    if (namedOutput == null || namedOutput.length() == 0) {
      throw new IllegalArgumentException(
        "Name cannot be NULL or emtpy");
    }
    // Inspect every character; each allowed range skips to the next character.
    for (char ch : namedOutput.toCharArray()) {
      if ((ch >= 'A') && (ch <= 'Z')) {
        continue;
      }
      if ((ch >= 'a') && (ch <= 'z')) {
        continue;
      }
      if ((ch >= '0') && (ch <= '9')) {
        continue;
      }
      // Reached only when ch matched none of the ranges above; the message
      // names the offending character.
      throw new IllegalArgumentException(
        "Name cannot be have a '" + ch + "' char");
    }
  }

程序运行成功输出：

模式：  192.168.75.130:9001  
输出路径存在，已删除！  
WARN - JobClient.copyAndConfigureFiles(746) | Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.  
INFO - FileInputFormat.listStatus(237) | Total input paths to process : 1  
WARN - NativeCodeLoader.<clinit>(52) | Unable to load native-hadoop library for your platform... using builtin-java classes where applicable  
WARN - LoadSnappy.<clinit>(46) | Snappy native library not loaded  
INFO - JobClient.monitorAndPrintJob(1380) | Running job: job_201404101853_0006  
INFO - JobClient.monitorAndPrintJob(1393) |  map 0% reduce 0%  
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 0%  
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 33%  
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 100%  
INFO - JobClient.monitorAndPrintJob(1448) | Job complete: job_201404101853_0006  
INFO - Counters.log(585) | Counters: 29  
INFO - Counters.log(587) |   Job Counters   
INFO - Counters.log(589) |     Launched reduce tasks=1  
INFO - Counters.log(589) |     SLOTS_MILLIS_MAPS=9289  
INFO - Counters.log(589) |     Total time spent by all reduces waiting after reserving slots (ms)=0  
INFO - Counters.log(589) |     Total time spent by all maps waiting after reserving slots (ms)=0  
INFO - Counters.log(589) |     Launched map tasks=1  
INFO - Counters.log(589) |     Data-local map tasks=1  
INFO - Counters.log(589) |     SLOTS_MILLIS_REDUCES=13645  
INFO - Counters.log(587) |   File Output Format Counters   
INFO - Counters.log(589) |     Bytes Written=0  
INFO - Counters.log(587) |   FileSystemCounters  
INFO - Counters.log(589) |     FILE_BYTES_READ=105  
INFO - Counters.log(589) |     HDFS_BYTES_READ=203  
INFO - Counters.log(589) |     FILE_BYTES_WRITTEN=113616  
INFO - Counters.log(589) |     HDFS_BYTES_WRITTEN=87  
INFO - Counters.log(587) |   File Input Format Counters   
INFO - Counters.log(589) |     Bytes Read=91  
INFO - Counters.log(587) |   Map-Reduce Framework  
INFO - Counters.log(589) |     Map output materialized bytes=105  
INFO - Counters.log(589) |     Map input records=6  
INFO - Counters.log(589) |     Reduce shuffle bytes=105  
INFO - Counters.log(589) |     Spilled Records=12  
INFO - Counters.log(589) |     Map output bytes=87  
INFO - Counters.log(589) |     Total committed heap usage (bytes)=176033792  
INFO - Counters.log(589) |     CPU time spent (ms)=1880  
INFO - Counters.log(589) |     Combine input records=0  
INFO - Counters.log(589) |     SPLIT_RAW_BYTES=112  
INFO - Counters.log(589) |     Reduce input records=6  
INFO - Counters.log(589) |     Reduce input groups=3  
INFO - Counters.log(589) |     Combine output records=0  
INFO - Counters.log(589) |     Physical memory (bytes) snapshot=278876160  
INFO - Counters.log(589) |     Reduce output records=0  
INFO - Counters.log(589) |     Virtual memory (bytes) snapshot=1460908032  
INFO - Counters.log(589) |     Map output records=6

模式：  192.168.75.130:9001
输出路径存在，已删除！
WARN - JobClient.copyAndConfigureFiles(746) | Use GenericOptionsParser for parsing the arguments. Applications should implement Tool for the same.
INFO - FileInputFormat.listStatus(237) | Total input paths to process : 1
WARN - NativeCodeLoader.<clinit>(52) | Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
WARN - LoadSnappy.<clinit>(46) | Snappy native library not loaded
INFO - JobClient.monitorAndPrintJob(1380) | Running job: job_201404101853_0006
INFO - JobClient.monitorAndPrintJob(1393) |  map 0% reduce 0%
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 0%
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 33%
INFO - JobClient.monitorAndPrintJob(1393) |  map 100% reduce 100%
INFO - JobClient.monitorAndPrintJob(1448) | Job complete: job_201404101853_0006
INFO - Counters.log(585) | Counters: 29
INFO - Counters.log(587) |   Job Counters 
INFO - Counters.log(589) |     Launched reduce tasks=1
INFO - Counters.log(589) |     SLOTS_MILLIS_MAPS=9289
INFO - Counters.log(589) |     Total time spent by all reduces waiting after reserving slots (ms)=0
INFO - Counters.log(589) |     Total time spent by all maps waiting after reserving slots (ms)=0
INFO - Counters.log(589) |     Launched map tasks=1
INFO - Counters.log(589) |     Data-local map tasks=1
INFO - Counters.log(589) |     SLOTS_MILLIS_REDUCES=13645
INFO - Counters.log(587) |   File Output Format Counters 
INFO - Counters.log(589) |     Bytes Written=0
INFO - Counters.log(587) |   FileSystemCounters
INFO - Counters.log(589) |     FILE_BYTES_READ=105
INFO - Counters.log(589) |     HDFS_BYTES_READ=203
INFO - Counters.log(589) |     FILE_BYTES_WRITTEN=113616
INFO - Counters.log(589) |     HDFS_BYTES_WRITTEN=87
INFO - Counters.log(587) |   File Input Format Counters 
INFO - Counters.log(589) |     Bytes Read=91
INFO - Counters.log(587) |   Map-Reduce Framework
INFO - Counters.log(589) |     Map output materialized bytes=105
INFO - Counters.log(589) |     Map input records=6
INFO - Counters.log(589) |     Reduce shuffle bytes=105
INFO - Counters.log(589) |     Spilled Records=12
INFO - Counters.log(589) |     Map output bytes=87
INFO - Counters.log(589) |     Total committed heap usage (bytes)=176033792
INFO - Counters.log(589) |     CPU time spent (ms)=1880
INFO - Counters.log(589) |     Combine input records=0
INFO - Counters.log(589) |     SPLIT_RAW_BYTES=112
INFO - Counters.log(589) |     Reduce input records=6
INFO - Counters.log(589) |     Reduce input groups=3
INFO - Counters.log(589) |     Combine output records=0
INFO - Counters.log(589) |     Physical memory (bytes) snapshot=278876160
INFO - Counters.log(589) |     Reduce output records=0
INFO - Counters.log(589) |     Virtual memory (bytes) snapshot=1460908032
INFO - Counters.log(589) |     Map output records=6

运行成功后，生成的文件如下所示：

使用Hadoop处理数据时，在Reduce阶段
china-r-00000里面的数据如下：

中国  我们  
中国  123

中国	我们
中国	123

USA-r-00000里面的数据如下：

美国  他们  
美国  USA  
美国  在北美洲

美国	他们
美国	USA
美国	在北美洲

cperson-r-00000里面的数据如下：

中国人  善良

中国人	 善良

在输出结果中，reduce自带的那个文件仍然会输出，但是里面没有任何数据，至此，我们已经在hadoop1.2.0的基于新的API里，测试多文件输出通过。

apache hadoop lib

玻璃心

0 关注 0 粉丝 0 动态

关注关注

为什么Java仍将是未来的主流语言？

Java是一种通用编程语言，1995年由Sun Micro-systems公司开发。尽管已经有25年的历史，但它仍然统治着整个世界。根据Stack-overflow的开发者调查，它在2019年最受欢迎的语言中排名第5。超过41%的调查用户将Java标记为

minerd 11评论 2020-10-28

.NET Core下使用Kafka的方法步骤

# 修改每个topic的默认分区参数num.partitions，默认是1，具体合适的取值需要根据服务器配置进程确定，UCloud.ukafka = 3. Segment：partition物理上由多个segment组成，下面2.2和2.3有详细说明。pa

Kafka 2020-09-18

解决PHPstudy Apache无法启动的问题【亲测有效】

原因一是防火墙拦截，关闭防火墙。二是80端口已经被别的程序占用，如IIS，迅雷等；三是没有安装VC9运行库,php和apache都是VC9编译。解决以上三个问题，基本上都是可以一次安装完成的。但是，But，但是，上面的方法都试过之后还是无法启动呢？最有效，

Wepe0 2020-10-30

Web安全：文件解析漏洞

PHP是用C语言编写的，MySQL则是用C++编写的，而Apache则大部分是使用C语言编写的，少部分是使用C++编写的。所以，文件解析漏洞的本质还是需要我们挖掘C语言、C++的漏洞。文件解析漏洞是指中间件在解析文件时出现了漏洞，从而攻击者可以利用该漏洞实

杜倩 14评论 2020-10-29

终于有人把Nginx说清楚了，图文详解！

想必大家一定听说过 Nginx，若没听说过它，那么一定听过它的"同行"Apache 吧！这也使得各个 Web 服务器有着各自鲜明的特点。Apache 的发展时期很长，而且是毫无争议的世界第一大服务器。这些都决定了 Apache 不可能成

windle 2020-10-29

如何使用Apache Web服务器来安装和配置网站？

Apache可与MySQL、PHP及另外大量软件包无缝协作，以便您架设简单的静态或动态网站。如何安装和配置服务器?您将文件放置在哪里?不妨介绍这方面，每次一个步骤。我将在Ubuntu Server 20.04上进行演示。不过先介绍一点基础知识。比如说，在基

mengzuchao 2020-10-22

CentOS 8 Apache 安装后 SSL 重定向提示证书错误

在启用 SSL 后，我们也希望将主域名重定向到 www。如果我们按照 80 端口的 http 配置的，你可能会得到浏览器的安全配置。这是因为如果你在 HTTPS 的 SSL 中也这样配置是不允许的。因为这个可能会导致安全性问题和攻击。简单来说，出现这个问题

Junzizhiai 2020-10-10

如何使用 Apache Directory Studio 连接 JumpCloud

JumpCloud 是一个基于云的 LDAP 服务。如果你的项目小组成员在 10 个或者 10 个以下的话，你可以免费使用 JumpCloud 服务器。这篇文章假设的是你已经设置好了 JumpCloud 的云服务，并且已经注册了 JumpCloud 的管理

bxqybxqy 2020-09-30

初学者和专业技术人员使用的十大机器学习软件

现在，让我们详细讨论每个机器学习软件。Tensorflow是机器学习的免费开源工具。这些应用程序在高级C ++中执行。它用于图像识别，手写分类，递归神经网络等。Tensorflow可以在CPU和GPU上平稳运行。它提供了良好的库来防止长时间编码。它是全球

风之沙城 2020-09-24

每个Java开发人员都应该知道的10大Github仓库

Java是业务应用程序开发中排名第一的编程语言，它也是顶级编程语言之一。Java具有许多功能强大且丰富的仓库，尽管标准Java库功能强大，但是在专业软件开发领域你还需要其他Java库。它在业界广泛使用，如果你正在从事一个大型项目，并且没有使用任何Apach

kingszelda 2020-09-22

漫话：应用程序被拖慢？罪魁祸首竟然是Log4j！

之前一段时间，为我们发现的一个SaaS应用程序会间歇性地卡顿、变慢，因为很长时间都没有定位到原因，所以解决的办法就只能是重启。这个现象和之前我们遇到的程序变得卡顿不太一样，因为我们发现这个应用程序不仅在高流量期间时会变慢，有时在低流量时期也会变慢。所以这令

大唐帝国前营 2020-08-18

JSP动态网页开发原理详解

　 JSP全称是Java Server Pages，它和servle技术一样，都是SUN公司定义的一种用于开发动态web资源的技术。　　JSP这门技术的最大的特点在于，写jsp就像在写html，但它相比html而言，html只能为用户提供静态数据，而Jsp

yixu0 2020-08-17

centos8使用Apache httpd2.4.37安装web服务器的步骤详解

第四步：firewall-cmd --zone=public --add-port=80/tcp --permanent　　#开启防火墙的80端口，并永久生效，返回success 设置成功的意思。第六步：firewall-cmd --zone=public

TangCuYu 2020-08-15

Tomcat启动springboot项目war包报错：启动子级时出错的问题

今天公司springboot项目准备部署到测试服务器上进行测试，打包好war后放到tomcat里面启动后，前端文件能访问到，但是接口请求一直是404，一直找了很久的原因，tomcat启动是成功的，war打包的时候也提示build success了，tomc

xiaoboliu00 2020-08-15

如何通过Apache在本地配置多个虚拟主机

如何使用 Apache 在本地配置出多个虚拟主机呢？而且使用不同的“域名”来访问本地不同的站点呢？1，根目录中有一个 phpMyAdmin/ 的文件夹，它是一个网页版的数据库管理系统（肯定不会陌生的吧！），我想通过访问一个简单的:. 来进入这个系统；这样，

songshijiazuaa 2020-08-15

Apache Shiro 反序列化(CVE-2016-4437)复现

shiro默认使用CookieRememberMeManager，对rememberMe的cookie做了加密处理，在CookieRememberMeManaer类中将cookie中rememberMe字段内容先后进行序列化、AES加密、Base64编码操

xclxcl 2020-08-03

Apache Shiro 反序列化(CVE-2016-4437)复现

zmzmmf 2020-08-03

Apache DolphinScheduler 诞生记

DolphinScheduler，简称”DS”, 中文名 “小海豚调度”。希望 DolphinScheduler 就像它的名字一样，成为一个“开箱即用”的灵活易用的调度系统。DAG 全称Directed Acyclic Graph，简称DAG。工作流中的T

newfarhui 2020-08-03

【Shiro】05 自定义Realm认证实现

Shiro默认使用自带的IniRealm，IniRealm从ini配置文件中读取用户的信息，大部分情况下需要从系统的数据库中读取用户信息，所以需要自定义realm。public abstract class CachingRealm implements

likesyour 2020-08-01

Web容器Web服务器及常见的Web容器有哪些？

　　首先来理解一下简单的一个请求发送到响应的过程。而我们的服务器通常要分为两个部分，一部分是服务器硬件，有了硬件之后还有有硬件上对应运行的软件。其次，服务器的硬件部分接收到了这一段请求，将其递交给对应的进程，服务器软件。此时这个服务器软件即为Web服务器，

lantingyue 2020-07-30

安科网

使用Hadoop处理数据时，在Reduce阶段

玻璃心

玻璃心

相关推荐

为什么Java仍将是未来的主流语言？

.NET Core下使用Kafka的方法步骤

解决PHPstudy Apache无法启动的问题【亲测有效】

Web安全：文件解析漏洞

终于有人把Nginx说清楚了，图文详解！

如何使用Apache Web服务器来安装和配置网站？

CentOS 8 Apache 安装后 SSL 重定向提示证书错误

如何使用 Apache Directory Studio 连接 JumpCloud

初学者和专业技术人员使用的十大机器学习软件

每个Java开发人员都应该知道的10大Github仓库

漫话：应用程序被拖慢？罪魁祸首竟然是Log4j！

JSP动态网页开发原理详解

centos8使用Apache httpd2.4.37安装web服务器的步骤详解

Tomcat启动springboot项目war包报错：启动子级时出错的问题

如何通过Apache在本地配置多个虚拟主机

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache Shiro 反序列化(CVE-2016-4437)复现

Apache DolphinScheduler 诞生记

【Shiro】05 自定义Realm认证实现

Web容器Web服务器及常见的Web容器有哪些？

玻璃心