awk -F'\t' 'BEGIN{initialization}{for every line}END{postprocessing}' inputfile
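For example, a small sketch (assuming the tab-separated input has a numeric second column) that prints the column's average:

awk -F'\t' 'BEGIN{n=0; sum=0} {sum += $2; n++} END{print "avg:", sum/n}' inputfile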
ssh tunnel
1. generate an ssh key and upload the public key to the server; confirm passwordless ssh works
2. ssh -f shawn@xxx.com -L 11111:xxx.com:222222 -N
3. connect to localhost:11111
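In the -L flag the syntax is local_port:destination_host:destination_port, resolved from the server's side. A quick sketch, assuming the remote box serves HTTP on port 8080 (that port is just an example):

ssh -f shawn@xxx.com -L 11111:localhost:8080 -N
curl http://localhost:11111/    # forwarded through the tunnel to port 8080 on xxx.com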
ssh passwordless
ssh-keygen
ssh-copy-id shawn@xxx.xxx.xxx.xxx
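After the key is copied, logging in should no longer ask for a password:

ssh shawn@xxx.xxx.xxx.xxx    # should open a shell without prompting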
# commit all modified tracked files (does not add new files)
git commit -a -m "commit message"
# pull and overwrite local uncommitted changes
git fetch --all
git reset --hard origin/master
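A quick way to see what the hard reset moves to and what it will discard (a sketch; branch names as above):

git fetch --all
git log --oneline HEAD..origin/master    # commits the local branch will move to
git status                               # uncommitted changes shown here are what --hard throws away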
If you want to enable assertions/annotations/generics/for-each (-source 1.6) and also want the compiled classes to be compatible with JVM 1.6 (-target 1.6):
<plugin>
  <groupId>org.apache.maven.plugins</groupId>
  <artifactId>maven-compiler-plugin</artifactId>
  <version>2.5.1</version>
  <configuration>
    <source>1.6</source>
    <target>1.6</target>
  </configuration>
</plugin>
If you want to generate a jar with dependencies:
<plugin>
  <artifactId>maven-assembly-plugin</artifactId>
  <configuration>
    <descriptorRefs>
      <descriptorRef>jar-with-dependencies</descriptorRef>
    </descriptorRefs>
  </configuration>
</plugin>
mvn assembly:assembly
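The assembly lands in target/ as <artifactId>-<version>-jar-with-dependencies.jar and can be run directly; the artifact name and main class below are hypothetical:

java -cp target/myapp-1.0-jar-with-dependencies.jar com.example.Main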
Counting distinct values is as easy as:
db.my_collection.distinct("_id")
but counting distinct substrings of the values (here, the prefix before ':') is not:
var cnt = new Object();
db.my_collection.find({_id:{$exists:true}}).forEach(function(doc){ if (doc._id != null && typeof doc._id == "string") cnt[doc._id.substring(0, doc._id.indexOf(':'))] = 1; });
var size = 0;
for (key in cnt) { if (cnt.hasOwnProperty(key)) size++; }
print(size);  // number of distinct prefixes
# Breiman example
hadoop jar $MAHOUT_HOME/core/target/mahout-core-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.tools.Describe -p testdata/glass.data -f testdata/glass.info -d I 9 N L
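# the -d descriptor lists attribute types in order (as I read the Describe tool): I = ignored, N = numerical, C = categorical, L = label, so "I 9 N L" = 1 ignored column, 9 numerical columns, then the label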
hadoop jar $MAHOUT_HOME/examples/target/mahout-examples-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.BreimanExample -d testdata/glass.data -ds testdata/glass.info -i 10 -t 100
# partial implementation example
hadoop jar $MAHOUT_HOME/core/target/mahout-core-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.tools.Describe -p testdata/KDDTrain+.arff -f testdata/KDDTrain+.info -d N 3 C 2 N C 4 N C 8 N 2 C 19 N L
hadoop jar $MAHOUT_HOME/examples/target/mahout-examples-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.mapreduce.BuildForest -Dmapred.max.split.size=1874231 -oob -d testdata/KDDTrain+.arff -ds testdata/KDDTrain+.info -sl 5 -p -t 100
hadoop jar $MAHOUT_HOME/examples/target/mahout-examples-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.mapreduce.TestForest -i testdata/KDDTest+.arff -ds testdata/KDDTrain+.info -m ob -a -mr -o predictions
hadoop jar $MAHOUT_HOME/examples/target/mahout-examples-0.7-SNAPSHOT-job.jar org.apache.mahout.classifier.df.tools.ForestVisualizer -ds testdata/KDDTrain+.info -m ob
1. start the master server (ubuntu in this example) and add username/group
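For example (user and group names here follow the rest of this post):

sudo addgroup hadoop
sudo adduser shawn
sudo adduser shawn hadoop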
2. install Java 1.6 (Sun JDK)
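Around CDH3, Sun JDK 6 on Ubuntu typically came from the partner repository, roughly as below (the release name lucid is an assumption):

sudo add-apt-repository "deb http://archive.canonical.com/ lucid partner"
sudo apt-get update
sudo apt-get install sun-java6-jdk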
3. install CDH3 (namenode, secondarynamenode, jobtracker, datanode, tasktracker, pig,…)
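On Ubuntu this is roughly the following package set (CDH3 names, matching the service names used below; the Cloudera apt repository must already be configured):

sudo apt-get install hadoop-0.20 hadoop-0.20-namenode hadoop-0.20-secondarynamenode \
  hadoop-0.20-jobtracker hadoop-0.20-datanode hadoop-0.20-tasktracker hadoop-pig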
4. config CDH3
sudo cp -r /etc/hadoop-0.20/conf.empty /etc/hadoop-0.20/conf.cluster
sudo update-alternatives --install /etc/hadoop-0.20/conf hadoop-0.20-conf /etc/hadoop-0.20/conf.cluster 50
(make sure it is the largest number; a tie causes problems)

sudo nano /etc/hadoop-0.20/conf.cluster/core-site.xml
<configuration>
  <property>
    <name>fs.default.name</name>
    <value>hdfs://master:8020</value>
  </property>
</configuration>

sudo nano /etc/hadoop-0.20/conf.cluster/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.name.dir</name>
    <value>/var/opt/cdh3/cluster/dfs/nn</value>
  </property>
  <property>
    <name>dfs.data.dir</name>
    <value>/var/opt/cdh3/cluster/dfs/dn</value>
  </property>
</configuration>

sudo nano /etc/hadoop-0.20/conf.cluster/mapred-site.xml
<configuration>
  <property>
    <name>mapred.job.tracker</name>
    <value>master:8021</value>
  </property>
  <property>
    <name>mapred.local.dir</name>
    <value>/var/opt/cdh3/cluster/mapred/local</value>
  </property>
  <property>
    <name>mapred.system.dir</name>
    <value>/mapred/system</value>
  </property>
</configuration>

sudo mkdir -p /var/opt/cdh3/cluster/dfs/nn
sudo mkdir -p /var/opt/cdh3/cluster/dfs/dn
sudo mkdir -p /var/opt/cdh3/cluster/mapred/local
sudo chown -R hdfs:hadoop /var/opt/cdh3/cluster/dfs
sudo chmod 700 /var/opt/cdh3/cluster/dfs/nn
sudo chown -R mapred:hadoop /var/opt/cdh3/cluster/mapred
sudo chmod 755 /var/opt/cdh3/cluster/mapred
5. make an image of the master
6. start slave1, slave2, slave3 from the image
7. update /etc/hosts on all machines to include hostnames for master and slaves
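For example (the private addresses are placeholders):

10.0.0.1  master
10.0.0.2  slave1
10.0.0.3  slave2
10.0.0.4  slave3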
8. update /etc/hadoop-0.20/conf.cluster/masters and /etc/hadoop-0.20/conf.cluster/slaves on all machines
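A minimal sketch of the two files; as I understand it, masters lists where the secondarynamenode runs and slaves lists the datanode/tasktracker hosts, one per line:

# /etc/hadoop-0.20/conf.cluster/masters
master
# /etc/hadoop-0.20/conf.cluster/slaves
slave1
slave2
slave3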
9. format namenode on master
sudo -u hdfs hadoop namenode -format
10. start namenode and secondarynamenode on master
sudo service hadoop-0.20-namenode start
sudo service hadoop-0.20-secondarynamenode start
11. start datanode on slaves
sudo service hadoop-0.20-datanode start
12. create the HDFS /tmp and mapred system directories
sudo -u hdfs hadoop fs -mkdir /tmp
sudo -u hdfs hadoop fs -chmod -R 1777 /tmp
sudo -u hdfs hadoop fs -mkdir /mapred/system
sudo -u hdfs hadoop fs -chown mapred:hadoop /mapred/system
13. start tasktracker on slaves
sudo service hadoop-0.20-tasktracker start
14. start jobtracker on master
sudo service hadoop-0.20-jobtracker start
15. add user
sudo -u hdfs hadoop fs -chown hdfs:hadoop /
sudo -u hdfs hadoop fs -mkdir /user
sudo -u hdfs hadoop fs -mkdir /user/shawn
sudo -u hdfs hadoop fs -chown shawn:hadoop /user/shawn