统计文件信息:
$ /opt/cdh-5.3.6/hadoop-2.5.0/bin/hdfs dfs -text /user/hadoop/wordcount/input/wc.input
hadoop spark spark hadoop oracle mysql postgresql postgresql oracle mysql mysql mongodb hdfs yarn mapreduce yarn hdfs zookeeper

针对于以上文件使用hive做词频统计:
-- Staging table: each line of the raw HDFS text file becomes one row in `line`.
-- IF NOT EXISTS makes the script re-runnable without a "table already exists" error.
create table if not exists docs (line string);
-- NOTE(review): LOAD DATA INPATH (without LOCAL) *moves* the HDFS file into the
-- table's warehouse directory — the source path no longer holds wc.input afterwards.
load data inpath '/user/hadoop/wordcount/input/wc.input' into table docs;
-- Word-count materialization: split each line on a single space, explode the
-- resulting array into one row per token, then count occurrences per token.
-- NOTE(review): Hive ORDER BY funnels everything through one reducer — fine for
-- this toy data set, but use SORT BY / DISTRIBUTE BY at scale. Consecutive
-- spaces in the input produce an empty-string token that is counted as well.
create table word_counts as
select word,count(1) as count from (select explode(split(line,' ')) as word from docs) word group by word order by word;分段解释:
-- Step 1: use split() to break each row's line into an array of tokens on single spaces:
select split(line,' ') from docs;
["hadoop","spark",""] ["spark","hadoop"] ["oracle","mysql","postgresql"] ["postgresql","oracle","mysql"] ["mysql","mongodb"] ["hdfs","yarn","mapreduce"] ["yarn","hdfs"] ["zookeeper"]

--使用explode函数对split的结果集进行行拆列:
-- Step 2: explode() flattens each array from split() into one output row per element.
select explode(split(line,' ')) as word from docs;
word hadoop spark spark
hadoop oracle mysql postgresql postgresql oracle mysql mysql mongodb hdfs yarn mapreduce yarn hdfs zookeeper

--以上输出内容已经满足对其做统计分析,这时通过sql对其进行分析:
-- Step 3: aggregate the exploded tokens — one group per distinct token,
-- counted with count(1) and listed alphabetically by word.
select word,count(1) as count from
(select explode(split(line,' ')) as word from docs) word group by word order by word;word count
	1
hadoop	2
hdfs	2
mapreduce	1
mongodb	1
mysql	3
oracle	2
postgresql	2
spark	2
yarn	2
zookeeper	1
(注:第一行的词为空字符串,计数为 1,来自输入中连续空格被 split 切出的空串,对应上面 split 输出中的 "" 元素。)