首页  ·  知识 ·  大数据
Hadoop实战演练:搜索数据分析----数据去重(1)
网友  CSDN博客  综合  编辑:黎昕   图片来源:网络
用户ID是根据用户使用浏览器访问搜索引擎时的Cookie信息自动赋值,即同一次使用浏览器输入的不同查询对应同一个用户ID

一、环境与数据


1、本地开发环境 

Windows7 + Eclipse Luna

hadoop版本:2.6.2

JDK版本:1.8


2、数据来源:

搜狗实验室

http://www.sogou.com/labs/resource/q.php


3、数据格式

用户ID\t[查询词]\t该URL在返回结果中的排名\t用户点击的顺序号\t用户点击的URL
其中,用户ID是根据用户使用浏览器访问搜索引擎时的Cookie信息自动赋值,即同一次使用浏览器输入的不同查询对应同一个用户ID

样例:

blob.png

4、数据去重

每个人的同一个搜索关键词可能会有多条点击记录。这里要去除同一个人、同一个关键词的重复记录,即每个(用户ID, 关键词)组合只保留一条。

二、编程实现

  1. package com.lin.keyword;  


  2. import java.io.IOException;  

  3. import java.util.StringTokenizer;  


  4. import org.apache.hadoop.conf.Configuration;  

  5. import org.apache.hadoop.fs.Path;  

  6. import org.apache.hadoop.io.Text;  

  7. import org.apache.hadoop.mapreduce.Job;  

  8. import org.apache.hadoop.mapreduce.Mapper;  

  9. import org.apache.hadoop.mapreduce.Reducer;  

  10. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;  

  11. import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;  

  12. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  

  13. import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;  

  14. import org.apache.hadoop.util.GenericOptionsParser;  


  15. /** 

  16.  * 功能概要:数据去重 

  17.  *  

  18.  * @author linbingwen 

  19.  * @since 2016年7月31日 

  20.  */

  21. publicclass CleanSameData {  

  22.     // map将输入中的value复制到输出数据的key上,并直接输出

  23.     publicstaticclass Map extends Mapper<Object, Text, Text, Text> {  


  24.         // 实现map函数

  25.         @Override

  26.         publicvoid map(Object key, Text value, Context context) throws IOException, InterruptedException {  

  27.             // 将输入的纯文本文件的数据转化成String

  28.             String line = value.toString();  

  29.             // 将输入的数据首先按行进行分割

  30.             StringTokenizer tokenizerArticle = new StringTokenizer(line, "\n");  

  31.             // 分别对每一行进行处理

  32.             while (tokenizerArticle.hasMoreElements()) {  

  33.                 // 每行按空格划分

  34.                 StringTokenizer tokenizerLine = new StringTokenizer(tokenizerArticle.nextToken());  

  35.                 String c1 = tokenizerLine.nextToken();//

  36.                 String c2 = tokenizerLine.nextToken();// 关键词

  37.                 c2 = c2.substring(1, c2.length() - 1);  

  38.                 Text newline = new Text(c1 + "    " + c2);  

  39.                 context.write(newline, new Text(""));  

  40.             }  


  41.         }  


  42.     }  


  43.     // reduce将输入中的key复制到输出数据的key上,并直接输出

  44.     publicstaticclass Reduce extends Reducer<Text, Text, Text, Text> {  

  45.         // 实现reduce函数

  46.         @Override

  47.         publicvoid reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {  

  48.             context.write(key, new Text(""));  

  49.         }  

  50.     }  


  51.     publicstaticvoid main(String[] args) throws Exception {  


  52.         Configuration conf = new Configuration();  

  53.         // 设置hadoop的机器、端口

  54.         conf.set("mapred.job.tracker""10.75.201.125:9000");  

  55.         // 设置输入输出文件目录

  56.         String[] ioArgs = new String[] { "hdfs://hmaster:9000/data_in""hdfs://hmaster:9000/clean_same_out" };  

  57.         String[] otherArgs = new GenericOptionsParser(conf, ioArgs).getRemainingArgs();  

  58.         if (otherArgs.length != 2) {  

  59.             System.err.println("Usage:  <in> <out>");  

  60.             System.exit(2);  

  61.         }  

  62.         // 设置一个job

  63.         Job job = Job.getInstance(conf, "clean same data");  

  64.         job.setJarByClass(CleanSameData.class);  


  65.         // 设置Map、Combine和Reduce处理类

  66.         job.setMapperClass(Map.class);  

  67.         job.setCombinerClass(Reduce.class);  

  68.         job.setReducerClass(Reduce.class);  


  69.         // 设置输出类型

  70.         job.setOutputKeyClass(Text.class);  

  71.         job.setOutputValueClass(Text.class);  


  72.         // 将输入的数据集分割成小数据块splites,提供一个RecordReder的实现

  73.         job.setInputFormatClass(TextInputFormat.class);  


  74.         // 提供一个RecordWriter的实现,负责数据输出

  75.         job.setOutputFormatClass(TextOutputFormat.class);  


  76.         // 设置输入和输出目录

  77.         FileInputFormat.addInputPath(job, new Path(otherArgs[0]));  

  78.         FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));  

  79.         System.exit(job.waitForCompletion(true) ? 0 : 1);  


  80.     }  


  81. }  


maven文件:

  <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
      xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
      <groupId>com.lin</groupId>
      <artifactId>zdy-hadoop</artifactId>
      <version>0.0.1-SNAPSHOT</version>

      <!-- Centralised dependency versions. -->
      <properties>
          <!-- Spring version -->
          <spring.version>4.3.0.RELEASE</spring.version>
          <!-- Logging library versions (SLF4J / log4j) -->
          <slf4j.version>1.6.6</slf4j.version>
          <log4j.version>1.2.12</log4j.version>
          <!-- JUnit version -->
          <junit.version>4.12</junit.version>
          <!-- Hadoop version shared by all org.apache.hadoop dependencies below -->
          <hadoop.version>2.6.0</hadoop.version>
          <!-- MyBatis version -->
          <mybatis.version>3.2.1</mybatis.version>
          <mysql.connect.version>6.0.3</mysql.connect.version>
      </properties>

      <dependencies>
          <!-- Unit testing -->
          <dependency>
              <groupId>junit</groupId>
              <artifactId>junit</artifactId>
              <version>${junit.version}</version>
              <scope>test</scope>
          </dependency>

          <!-- Logging: log4j, the SLF4J API and logback -->
          <!-- NOTE(review): slf4j-api ${slf4j.version} is declared next to logback 1.1.2;
               confirm this pairing is compatible and that only one SLF4J binding ends up
               on the classpath. -->
          <dependency>
              <groupId>log4j</groupId>
              <artifactId>log4j</artifactId>
              <version>${log4j.version}</version>
          </dependency>
          <dependency>
              <groupId>org.slf4j</groupId>
              <artifactId>slf4j-api</artifactId>
              <version>${slf4j.version}</version>
          </dependency>
          <dependency>
              <groupId>ch.qos.logback</groupId>
              <artifactId>logback-classic</artifactId>
              <version>1.1.2</version>
          </dependency>
          <dependency>
              <groupId>ch.qos.logback</groupId>
              <artifactId>logback-core</artifactId>
              <version>1.1.2</version>
          </dependency>
          <dependency>
              <groupId>org.logback-extensions</groupId>
              <artifactId>logback-ext-spring</artifactId>
              <version>0.1.1</version>
          </dependency>

          <!-- Hadoop; all three artifacts follow the single hadoop.version property -->
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-common</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-hdfs</artifactId>
              <version>${hadoop.version}</version>
          </dependency>
          <dependency>
              <groupId>org.apache.hadoop</groupId>
              <artifactId>hadoop-client</artifactId>
              <version>${hadoop.version}</version>
          </dependency>

          <!-- tools.jar taken from the local JDK installation -->
          <dependency>
              <groupId>jdk.tools</groupId>
              <artifactId>jdk.tools</artifactId>
              <version>1.8</version>
              <scope>system</scope>
              <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
          </dependency>

          <!-- MySQL JDBC driver -->
          <dependency>
              <groupId>mysql</groupId>
              <artifactId>mysql-connector-java</artifactId>
              <version>${mysql.connect.version}</version>
          </dependency>
      </dependencies>

      <build>
          <finalName>zdy-hadoop</finalName>
          <filters>
              <!-- Selects which environment's properties file is used for filtering -->
              <filter>src/main/resources/env/${env}.properties</filter>
          </filters>
          <!-- Resource filtering: placeholders in the listed files are replaced with
               values from this POM and the filter file above -->
          <resources>
              <resource>
                  <directory>src/main/resources</directory>
                  <targetPath>${basedir}/target/classes</targetPath>
                  <includes>
                      <include>**/*.properties</include>
                      <include>**/*.xml</include>
                  </includes>
                  <filtering>true</filtering>
              </resource>
              <resource>
                  <directory>src/main/resources</directory>
                  <targetPath>${basedir}/target/resources</targetPath>
                  <includes>
                      <include>**/*.properties</include>
                      <include>**/*.xml</include>
                  </includes>
                  <filtering>true</filtering>
              </resource>
          </resources>

          <!-- Build plugins -->
          <plugins>
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-compiler-plugin</artifactId>
                  <version>2.0.2</version>
                  <configuration>
                      <source>1.8</source>
                      <target>1.8</target>
                      <encoding>UTF-8</encoding>
                  </configuration>
              </plugin>
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-jar-plugin</artifactId>
                  <executions>
                      <!-- During prepare-package, build a jar from classes/com -->
                      <execution>
                          <phase>prepare-package</phase>
                          <goals>
                              <goal>jar</goal>
                          </goals>
                          <configuration>
                              <includes>
                                  <include>com/**</include>
                              </includes>
                          </configuration>
                      </execution>
                  </executions>
              </plugin>
              <plugin>
                  <groupId>org.apache.maven.plugins</groupId>
                  <artifactId>maven-assembly-plugin</artifactId>
                  <configuration>
                      <!-- Do not append the assembly id to the release file name -->
                      <appendAssemblyId>false</appendAssemblyId>
                      <finalName>${project.artifactId}</finalName>
                      <descriptors>
                          <descriptor>src/main/assemble/assembly.xml</descriptor>
                      </descriptors>
                  </configuration>
                  <executions>
                      <execution>
                          <id>make-assembly</id>
                          <phase>package</phase>
                          <goals>
                              <goal>single</goal>
                          </goals>
                      </execution>
                  </executions>
              </plugin>
          </plugins>
      </build>

      <profiles>
          <!-- Development environment; its properties file lives under src/main/resources/env -->
          <profile>
              <id>dev</id>
              <activation>
                  <activeByDefault>true</activeByDefault>
              </activation>
              <properties>
                  <env>dev</env>
              </properties>
          </profile>
      </profiles>
  </project>



输出结果:

blob.png

本文作者:网友 来源:CSDN博客
CIO之家 www.ciozj.com 微信公众号:imciow
    >>频道首页  >>网站首页   纠错  >>投诉
版权声明:CIO之家尊重行业规范,每篇文章都注明有明确的作者和来源;CIO之家的原创文章,请转载时务必注明文章作者和来源;
延伸阅读
也许感兴趣的
我们推荐的
主题最新
看看其它的