HADOOP笔试.doc-道客多多_道客多多docduoduo.com

资源描述

1、1 使用 Hive 或者自定义 MR 实现如下逻辑product_no lac_id moment start_time user_id county_id staytime city_id13429100031 22554 8 2013-03-11 08:55:19.151754088 571 571 282 57113429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 57113429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 103 5711342910008

2、7 22705 8 2013-03-11 08:56:51.139539816 571 571 220 57113429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 57113429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 57113429100140 26642 9 2013-03-11 09:02:19.151754088 571 571 18 57113429100082 22691 8 2013-03-11 08:57:32.151754088 57

3、1 571 287 57113429100189 22558 8 2013-03-11 08:56:24.139539816 571 571 48 57113429100349 22503 8 2013-03-11 08:54:30.152622440 571 571 211 571字段解释：product_no：用户手机号；lac_id：用户所在基站；start_time：用户在此基站的开始时间；staytime：用户在此基站的逗留时间。需求描述：根据 lac_id 和 start_time 知道用户当时的位置，根据 staytime 知道用户各个基站的逗留时长。根据轨迹合并连续基站的 st

4、aytime。最终得到每一个用户按时间排序在每一个基站驻留时长期望输出举例：13429100082 22540 8 2013-03-11 08:58:20.152622488 571 571 270 57113429100082 22691 8 2013-03-11 08:56:37.149593624 571 571 390 57113429100082 22540 8 2013-03-11 08:55:38.140225200 571 571 133 57113429100087 22705 8 2013-03-11 08:56:51.139539816 571 571 220 57113

5、429100087 22540 8 2013-03-11 08:55:45.150276800 571 571 66 5712 Linux 脚本能力考察2.1 请随意使用各种类型的脚本语言实现：批量将指定目录下的所有文件中的$HADOOP_HOME$替换成/home/ocetl/app/hadoop2.2 假设有 10 台主机，H1 到 H10，在开启 SSH 互信的情况下，编写一个或多个脚本实现在所有的远程主机上执行脚本的功能例如：runRemoteCmd.sh “ls -l“期望结果：H1:XXXXXXXXXXXXXXXXXXXXXXXXH2:XXXXXXXXXXXXXXXXXXXXXXX

6、XH3:.3 Hadoop 基础知识与问题分析的能力 3.1 描述一下 hadoop 中，有哪些地方使用了缓存机制，作用分别是什么？3.2 请描述 https://issues.apache.org/jira/browse/HDFS-2379 说的是什么问题，最终解决的思路是什么？4 MapReduce 开发能力请参照 wordcount 实现一个自己的 map reduce，需求为：a 输入文件格式：xxx,xxx,xxx,xxx,xxx,xxx,xxxb 输出文件格式：xxx,20xxx,30xxx.40c 功能：根据命令行参数统计输入文件中指定关键字出现的次数，并展示出来例如：hadoop

7、 jar xxxxx.jar keywordcount xxx,xxx,xxx,xxx（四个关键字）5 MapReduce 优化请根据第五题中的程序, 提出如何优化 MR 程序运行速度的思路6 Linux 操作系统知识考察请列举曾经修改过的/etc 下的配置文件，并说明修改要解决的问题？7 Java 开发能力7.1 写代码实现 1G 大小的文本文件，行分隔符为\x01\x02,统计一下该文件中的总行数，要求注意边界情况的处理7.2 请描述一下在开发中如何对上面的程序进行性能分析，对性能进行优化的过程答案如下：1. 考虑后，决定使用 MR 来实现，于是使用 Java，用一个 MR Job 完成这个事

8、情： 1. package org.aboutyun;2.3. import mons.lang.StringUtils;4. import org.apache.hadoop.conf.Configuration;5. import org.apache.hadoop.fs.Path;6. import org.apache.hadoop.io.LongWritable;7. import org.apache.hadoop.io.Text;8. import org.apache.hadoop.mapreduce.Job;9. import org.apache.hadoop.mapred

9、uce.Mapper;10. import org.apache.hadoop.mapreduce.Reducer;11. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;12. import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;13. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;14. import org.apache.hadoop.mapreduce.lib.ou

10、tput.TextOutputFormat;15.16. import java.io.IOException;17. import java.text.ParseException;18. import java.text.SimpleDateFormat;19. import java.util.ArrayList;20. import java.util.Collections;21. import java.util.Comparator;22.23. public class TimeCount 24. public static void main(String args) thr

11、ows Exception 25. Configuration conf = new Configuration();26.27. Job job = new Job(conf, “time_count“);28.29. job.setOutputKeyClass(Text.class);30. job.setOutputValueClass(Text.class);31.32. job.setMapperClass(Map.class);33. job.setReducerClass(Reduce.class);34.35. job.setInputFormatClass(TextInput

12、Format.class);36. job.setOutputFormatClass(TextOutputFormat.class);37.38. FileInputFormat.addInputPath(job, new Path(args0);39. FileOutputFormat.setOutputPath(job, new Path(args1);40.41. job.waitForCompletion(true);42. 43.44. public static class Map extends Mapper 45. private Text id = new Text();46

13、. private Text row = new Text();47.48. public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 49. String line = value.toString();50. String items = line.split(“t“);51.52. if (items.length = 8) 53. if (StringUtils.isNumeric(items6) 54. id.set(items0 +

14、“t“ + items1);55. row.set(line);56. context.write(id, row);57. 58. else 59. System.out.println(“Wrong length: “ + items.length);60. 61. 62. 63.64. public static class Reduce extends Reducer 65. private static final SimpleDateFormat format = new SimpleDateFormat(“yyyy-MM-dd HH:mm:ss“);66.67. static 6

15、8. format.setLenient(false);69. 70.71. private Text rest = new Text();72.73. public void reduce(Text key, Iterable values, Context context)74. throws IOException, InterruptedException 75. / Parse row to Record76. ArrayList list = new ArrayList();77. for (Text row : values) 78. String items = row.toS

16、tring().split(“t“);79. try 80. Record record = new Record();81. record.items = items;82. record.start_time = format.parse(items3).getTime();83. record.stay_time = Long.parseLong(items6) * 1000;84. list.add(record);85. catch (ParseException e) 86. e.printStackTrace();87. 88.89. 90.91. / Sort92. Colle

17、ctions.sort(list, new Comparator() 93. Override94. public int compare(Record r1, Record r2) 95. return (int) (r1.start_time - r2.start_time);96. 97. );98.99. / Find and merge slice100. ArrayList result = new ArrayList();101. for (Record r1 : list) 102. boolean found = false;103. long r1_stop_time =

18、r1.start_time + r1.stay_time;104. for (Record r2 : result) 105. long r2_stop_time = r2.start_time + r2.stay_time;106. if (r1.start_time r2.start_time 109. found = true;110. 111. 112.113. if (!found) 114. result.add(r1);115. 116. 117.118. / Output119. for (Record r : result) 120. key.set(r.items0);12

19、1.122. String value = r.items1 + “t“123. + r.items2 + “t“124. + r.items3 + “t“125. + r.items4 + “t“126. + r.items5 + “t“127. + (r.stay_time / 1000) + “t“128. + r.items6 + “t“;129. rest.set(value);130.131. context.write(key, rest);132. 133.134. 135.136. static class Record 137. String items;138. long

20、 start_time;139. long stay_time;140. 141. 142.复制代码2. 2.1 使用 find + sed 来实现：find /home/ocetl/app/hadoop -exec sed -i s/$HADOOP_HOME$/home/ocetl/app/hadoop/g ;2.2 直接使用 ssh 的参数 1. #!/bin/bash2. if $# -ne 1 3. then4. echo “Usage: basename $0 command“5. exit6. fi7.8. for i in H1 H2 H3 H4 H5 H6 H7 H8 H9 H

21、109. do10. echo “$i:“11. ssh $i “$1“12. done复制代码3. 3.1 不了解，HDFS 用了缓存。3.2 问题是当硬盘空间很大，而内存页面缓存很少的时候，DN 的 Block report 需要很长时间生成，而此时 FSVolumeSet 锁是锁住的，因此所有读写操作都无法执行，最终导致那些操作超时。该 issue 建议提供一种方法使 block report 不需要持有 FSVolumeSet 锁，从而不会导致那些任务失败。4. 只是替换分隔符从空格到逗号，以及增加搜索关键字列表： 1. import org.apache.hadoop.conf.Configu

22、ration;2. import org.apache.hadoop.fs.Path;3. import org.apache.hadoop.io.IntWritable;4. import org.apache.hadoop.io.LongWritable;5. import org.apache.hadoop.io.Text;6. import org.apache.hadoop.mapreduce.Job;7. import org.apache.hadoop.mapreduce.Mapper;8. import org.apache.hadoop.mapreduce.Reducer;9

23、. import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;10. import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;11. import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;12. import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;13.14. import java.io.IOException;15.

24、import java.util.ArrayList;16.17. public class WordCount 18.19. public static class Map extends Mapper 20. private final static IntWritable one = new IntWritable(1);21. private Text word = new Text();22. private final static ArrayList target_words = new ArrayList();23.24. public void map(LongWritabl

25、e key, Text value, Context context) throws IOException, InterruptedException 25. String items = value.toString().toLowerCase().replaceAll(“pPunct“, “).split(“s+“);26. for (String item : items) 27. if (target_words.contains(item) 28. word.set(item);29. context.write(word, one);30. 31. 32. 33.34. publ

26、ic static void clear() 35. target_words.clear();36. 37.38. public static void add(String word) 39. target_words.add(word);40. 41. 42.43. public static class Reduce extends Reducer 44.45. public void reduce(Text key, Iterable values, Context context)46. throws IOException, InterruptedException 47. in

27、t sum = 0;48. for (IntWritable val : values) 49. sum += val.get();50. 51. context.write(key, new IntWritable(sum);52. 53. 54.55. public static void main(String args) throws Exception 56. Configuration conf = new Configuration();57.58. if (args.length “);60. return;61. 62.63. / Add to target64. Strin

28、g target_words = args2.split(“,“);65. for (String word : target_words) 66. Map.add(word.toLowerCase();67. 68.69. Job job = new Job(conf, “wordcount“);70.71. job.setOutputKeyClass(Text.class);72. job.setOutputValueClass(IntWritable.class);73.74. job.setMapperClass(Map.class);75. job.setReducerClass(R

29、educe.class);76.77. job.setInputFormatClass(TextInputFormat.class);78. job.setOutputFormatClass(TextOutputFormat.class);79.80. FileInputFormat.addInputPath(job, new Path(args0);81. FileOutputFormat.setOutputPath(job, new Path(args1);82.83. job.waitForCompletion(true);84. 85.86. 复制代码5. 第五题的程序是什么？6. h

30、osts：增加局域网主机名和 ip 对应关系，省得再记住 ip;hostname：该主机名，克隆虚拟机的时候经常需要这么做；fstab：修改挂载点，加新硬盘的时候会需要；profile, bash.bashrc: 修改系统范围环境变量时经常用；network/interfaces：配置静态 IP 时需要。77.1 1. package org.aboutyun;2.3. import java.io.BufferedReader;4. import java.io.FileNotFoundException;5. import java.io.FileReader;6. import java

31、.io.IOException;7.8. public class LineCounter 9. public static void main(String args) 10. try 11. BufferedReader reader = new BufferedReader(new FileReader(args0);12. char buffer = new char4096;13. int count;14. char last = 0;15. long line_count = 0;16. while(count = reader.read(buffer) = 0) 17. if

32、(count 0 20. 21.22. for (int i = 0; i count ; +i) 23. char c = bufferi;24. if (c = 0x02) 25. if (i = 0 28. else if (bufferi-1 = 0x01) 29. / normal one30. +line_count;31. 32. 33. 34.35. / keep the last one36. last = buffercount-1;37. 38.39. System.out.println(line_count);40. catch (FileNotFoundException e) 41. e.printStackTrace();42. catch (IOException e) 43. e.printStackTrace();44. 45. 46. 复制代码7.2 可以使用 Profiler 来对性能进行评估分析，比如 Eclipse 的 TPTP，或者 JProfiler。可以观察不同函数调用次数以及占用时间，从而减少调用次数，以及优化函数内部。

展开阅读全文