Web search benchmarks
Nutch indexing: large-scale search indexing. This workload stresses the indexing subsystem of Nutch (an open-source Apache search engine) with automatically generated web data whose hyperlinks and words follow a Zipfian distribution: a word's frequency is inversely proportional to its rank in the frequency table, so for any term the product of its frequency and its rank is roughly a constant.
Pagerank: this workload contains an implementation of the PageRank algorithm on Hadoop, again using automatically generated web data whose hyperlinks follow a Zipfian distribution.
Data analytics benchmarks
Hive query benchmarks (hivebench): Hive queries (aggregation and join) performing typical OLAP workloads, over automatically generated web data whose hyperlinks follow a Zipfian distribution.
Note: the data-generation programs live in the hadoop-mapreduce-examples-2.6.0 jar and can be inspected with a decompiler; the small sketch below illustrates the Zipfian property.
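A minimal sketch (illustrative only, not the HiBench generator; the vocabulary and corpus sizes are made up) of why rank * frequency stays roughly constant under a Zipf distribution with exponent 1:

# Zipf weights with exponent s = 1: frequency(rank) ~ 1/rank.
N_VOCAB = 1000     # hypothetical vocabulary size
N_TOKENS = 100000  # hypothetical corpus size

weights = [1.0 / rank for rank in range(1, N_VOCAB + 1)]
total = sum(weights)

for rank in (1, 10, 100):
    freq = N_TOKENS * weights[rank - 1] / total  # expected count of the rank-th word
    # rank * freq is always N_TOKENS / total, i.e. constant across ranks
    print("rank=%d freq=%.1f rank*freq=%.1f" % (rank, freq, rank * freq))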
2. The bayes algorithm flow in HiBench
The main flow: configure the test items, test language, and DataSize under conf, then run run-all.sh under bin to complete one test. This flow is manual, so a script can repeat the steps to run the benchmark many times and reduce the manual work; e.g.
#!/bin/bash
# Time: 20160930, created by sunfei
# Describe: automatically run HiBench
# Functions:
#   search(): find the scale profile in 99-user_defined_properties.conf, e.g. tiny, small, ...
#   exec_application_noSQL(): run the application several times, without Hive
#   exec_application_SQL(): run the application several times, with Hive
#   save_result(): save the result of the application
#   main_function(): the main function that runs all the applications
#   main(): the entry point for running different kinds of applications
cpuLoad()
{
    cpu=`grep -c 'model name' /proc/cpuinfo`
    load_15=`uptime | awk '{print $NF}'`
    average_load=`echo "scale=2;a=${load_15}/${cpu};if(length(a)==scale(a)) print 0;print a" | bc`
    date >> datetime-load.txt
    echo ${average_load} >> cpu-load.txt
    paste datetime-load.txt cpu-load.txt >> load-day.txt
}
search()
{
    #config="/opt/HiBench/HiBench-master/conf/99-user_defined_properties.conf"
    config=/usr/HiBench-master/conf/99-user_defined_properties.conf
    sed -n '/hibench.scale.profile/p' ${config} >> hibench.txt
    var=''
    while read line
    do
        if [ ${line:0:13} = "hibench.scale" ];then
            echo -e "\033[32m match successful! \033[0m"
            var=${line:22}
        fi
    done < "hibench.txt"
    if [ "$var" = "${1}" ];then
        echo -e "\033[31m The scale of the application is unchanged, do you want to continue? yes | no \033[0m"
        read -p "Input your choice: " chose
        if [ "${chose}" = "no" ];then
            exit 1
        else
            echo -e "\033[32m The ${1} scale of the application will be run! \033[0m"
        fi
    fi
    if [ -f "hibench.txt" ];then
        rm -rf "hibench.txt"
        echo -e "\033[32m hibench.txt has been deleted! \033[0m"
    fi
    echo -e "\033[32m The application will run at the ${1} scale \033[0m"
    sed -i "s/${var}/${1}/" ${config}
}
exec_application_noSQL()
{
    var=0
    for ((i=1;i<=${1};i++))
    do
        let "var=$i%1"
        if [ "$var" -eq 0 ];then
            hadoop fs -rm -r hdfs://archive.cloudera.com:8020/user/hdfs/.Trash/*
            hadoop fs -rm -r hdfs://archive.cloudera.com:8020/HiBench/*
        fi
        echo -e "\033[32m ********************** The current run is: ********************\033[0m" ${i}
        #/opt/HiBench/HiBench-master/bin/run-all.sh
        /usr/HiBench-master/bin/run-all.sh
        echo -e "\033[32m ********************** Run "${i}" has finished successfully! ********************\033[0m"
    done
    echo -e "\033[32m ********* The application has finished, please modify the configuration! ***** \033[0m"
}
exec_application_SQL()
{
    var=0
    for ((i=1;i<=${1};i++))
    do
        echo "drop table uservisits;drop table uservisits_aggre;drop table rankings;drop table rankings_uservisits_join;drop table uservisits_copy;exit;" | /usr/bin/hive
        let "var=$i%1"
        if [ "$var" -eq 0 ];then
            hadoop fs -rm -r hdfs://archive.cloudera.com:8020/user/hdfs/.Trash/*
            hadoop fs -rm -r hdfs://archive.cloudera.com:8020/HiBench/*
        fi
        echo -e "\033[32m ********************** The current run is: ********************\033[0m" ${i}
        #/opt/HiBench/HiBench-master/bin/run-all.sh
        /usr/HiBench-master/bin/run-all.sh
        echo -e "\033[32m ********************** Run "${i}" has finished successfully! ********************\033[0m"
    done
    echo -e "\033[32m ********* The application has finished, please modify the configuration! ***** \033[0m"
}
save_result()
{
    if [ -f result.txt ];then
        rm -rf result.txt
        echo -e "\033[32m result.txt has been deleted! \033[0m"
    fi
    # select the matching lines in the report
    #filepath=/opt/HiBench/HiBench-master/report/hibench.report
    filepath=/usr/HiBench-master/report/hibench.report
    word=""
    var1=`date +"%m/%d/%Y-%k:%M:%S"`
    var2=${1}
    var5=".txt"
    var4=${var2}${var5}
    case ${1} in
        "aggregation")
            word="JavaSparkAggregation"
            ;;
        "join")
            word="JavaSparkJoin"
            ;;
        "scan")
            word="JavaSparkScan"
            ;;
        "kmeans")
            word="JavaSparkKmeans"
            ;;
        "pagerank")
            word="JavaSparkPagerank"
            ;;
        "sleep")
            word="JavaSparkSleep"
            ;;
        "sort")
            word="JavaSparkSort"
            ;;
        "wordcount")
            word="JavaSparkWordcount"
            ;;
        "bayes")
            word="JavaSparkBayes"
            ;;
        "terasort")
            word="JavaSparkTerasort"
            ;;
        *)
            echo -e "\033[32m The name of the application is wrong, please change it! \033[0m"
            ;;
    esac
    while read line
    do
        echo $line | sed -n "/${word}/p" >> ${var4}
    done < $filepath
    echo -e "\033[32m The job has finished! \033[0m"
}
main_function()
{
    # run every application at every scale
    for appName in aggregation join scan pagerank sleep sort wordcount bayes terasort kmeans
    do
        #appConfig=/opt/HiBench/HiBench-master/conf/benchmarks.lst
        appConfig=/usr/HiBench-master/conf/benchmarks.lst
        echo "The name of the application is: "${appName}
        echo ${appName} > ${appConfig}
        for style in tiny small large huge gigantic
        do
            search ${style}
            if [ "aggregation" = ${appName} ] || [ "join" = ${appName} ] || [ "scan" = ${appName} ];then
                exec_application_SQL ${1}
            else
                exec_application_noSQL ${1}
            fi
        done
        save_result ${appName}
    done
}
main()
{
    # run the application
    read -p "Input the times of exec: " times
    if [ "${times}" -eq 0 -o "${times}" -gt 60 ];then
        echo -e "\033[31m The times cannot be empty or greater than 60! Do you want to continue? yes | no\033[0m"
        read -p "Input your choice: " chose
        if [ "${chose}" = "no" ];then
            exit 1
        else
            echo -e "\033[32m The application will be run ${times} times! \033[0m"
        fi
    fi
    echo -e "\033[33m Select the mode of execution: \033[0m \033[31m All | Single \033[0m"
    read -p "Input your choice: " style
    if [ "${style}" = "" ];then
        echo -e "\033[31m The mode cannot be empty \033[0m"
        exit 1
    elif [ "${style}" != "All" -a "${style}" != "Single" ];then
        echo -e "\033[31m The mode is wrong, please correct it! \033[0m"
        exit 1
    else
        echo -e "\033[32m The mode is ok! \033[0m"
    fi
    if [ "All" = "${style}" ];then
        main_function ${times}
    else
        echo -e "\033[33m Input the name of the application, e.g.:\033[0m \033[31m aggregation | join | scan | kmeans | pagerank | sleep | sort | wordcount | bayes | terasort\033[0m"
        read -p "Input your choice: " application
        if [ "${application}" = "" ];then
            echo -e "\033[31m The name of the application cannot be empty! \033[0m"
            exit 1
        fi
        echo "******************** The ${application} will be executed **********************"
        appConfig=/usr/HiBench-master/conf/benchmarks.lst
        #appConfig=/opt/HiBench/HiBench-master/conf/benchmarks.lst
        read -p "Do you want to run all scales of the application (tiny,small,large,huge,gigantic)? yes | no " chose
        if [ "${chose}" = "" ];then
            echo -e "\033[31m The choice cannot be empty! \033[0m"
            exit 1
        elif [ "yes" != ${chose} ] && [ "no" != ${chose} ];then
            echo -e "\033[31m The choice is wrong, please correct it! \033[0m"
            exit 1
        else
            echo -e "\033[32m The choice is ok! \033[0m"
        fi
        read -p "Input the scale(s) of the application, e.g. ( tiny small large huge gigantic )! " appStyle
        echo "*************************** The ${appStyle} scale will be executed ***************************"
        for appName in ${application}
        do
            echo ${appName} > ${appConfig}
            if [ "yes" = "${chose}" ];then
                for var in tiny small large huge gigantic
                do
                    echo "****************** The ${appName} will be executed! ************************************"
                    search ${var}
                    if [ "aggregation" = ${appName} ] || [ "join" = ${appName} ] || [ "scan" = ${appName} ];then
                        exec_application_SQL ${times}
                    else
                        exec_application_noSQL ${times}
                    fi
                done
            else
                echo "************************** The ${appName} will be executed! ************************"
                if [ "${appStyle}" = "" ];then
                    echo -e "\033[31m The scale cannot be empty! \033[0m"
                    exit 1
                fi
                for var in ${appStyle}
                do
                    search ${var}
                    if [ "aggregation" = ${appName} ] || [ "join" = ${appName} ] || [ "scan" = ${appName} ];then
                        exec_application_SQL ${times}
                    else
                        exec_application_noSQL ${times}
                    fi
                done
            fi
            save_result ${appName}
        done
    fi
}
# the main function of application
main
The bayes workload itself is the following PySpark program (HiBench's naive bayes example on MLlib):

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
A naive bayes program using MLlib.
This example requires NumPy (http://www.numpy.org/).
"""import sys
from pyspark import SparkContext
from pyspark.mllib.util import MLUtils
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors
from pyspark.storagelevel import StorageLevel
from operator import add
from itertools import groupby
# Adopted from Spark's docs: http://spark.apache.org/docs/latest/mllib-naive-bayes.html
#
# def parseVector(line):
#     return np.array([float(x) for x in line.split(' ')])
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, "Usage: bayes <file>"
        exit(-1)
    sc = SparkContext(appName="PythonNaiveBayes")
    filename = sys.argv[1]

    # the documents are (key, text) pairs stored in a Hadoop sequence file
    data = sc.sequenceFile(filename, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    # count each word, then build a broadcast dictionary
    # mapping word -> (feature index, relative frequency)
    wordCount = data \
        .flatMap(lambda (key, doc): doc.split(" ")) \
        .map(lambda x: (x, 1)) \
        .reduceByKey(add)
    wordSum = wordCount.map(lambda x: x[1]).reduce(lambda x, y: x + y)
    wordDict = wordCount.zipWithIndex() \
        .map(lambda ((key, count), index): (key, (index, count * 1.0 / wordSum))) \
        .collectAsMap()
    sharedWordDict = sc.broadcast(wordDict)
    # for each document, generate a sparse vector based on word frequency
    def doc2vector(dockey, doc):
        # map each word to (index, freq), then combine entries of the same word
        docVector = [(key, sum((z[1] for z in values))) for key, values in
                     groupby(sorted([sharedWordDict.value[x] for x in doc.split(" ")],
                                    key=lambda x: x[0]),
                             key=lambda x: x[0])]
        (indices, values) = zip(*docVector)  # unzip
        label = float(dockey[6:])
        return label, indices, values

    vector = data.map(lambda (dockey, doc): doc2vector(dockey, doc))
    vector.persist(StorageLevel.MEMORY_ONLY)
    # feature dimension: the largest feature index seen, plus one
    d = vector.map(lambda (label, indices, values): indices[-1] if indices else 0) \
        .reduce(lambda a, b: max(a, b)) + 1
    # print "###### Load svm file", filename
    # examples = MLUtils.loadLibSVMFile(sc, filename, numFeatures = numFeatures)
    examples = vector.map(lambda (label, indices, values): LabeledPoint(label, Vectors.sparse(d, indices, values)))
    examples.cache()
    # FIXME: need randomSplit!
    training = examples.sample(False, 0.8, 2)
    test = examples.sample(False, 0.2, 2)
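    # A possible fix for the FIXME above (an assumption, not part of the
    # original script): RDD.randomSplit returns disjoint train/test sets,
    # whereas two independent sample() calls may overlap or drop records:
    # training, test = examples.randomSplit([0.8, 0.2], seed=2)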
    numTraining = training.count()
    numTest = test.count()
    print " numTraining = %d, numTest = %d." % (numTraining, numTest)

    # train a multinomial naive bayes model with smoothing parameter 1.0,
    # then broadcast it and score the held-out test set
    model = NaiveBayes.train(training, 1.0)
    model_share = sc.broadcast(model)
    predictionAndLabel = test.map(lambda x: (x.label, model_share.value.predict(x.features)))
    # prediction = model.predict(test.map(lambda x: x.features))
    # predictionAndLabel = prediction.zip(test.map(lambda x: x.label))
    accuracy = predictionAndLabel.filter(lambda x: x[0] == x[1]).count() * 1.0 / numTest
    print "Test accuracy = %s." % accuracy