thiscodeWorks - Organizing the best of code online

Snippets Collections

Recent | Popular

List view

Change region in Hue.ini file

sudo sed -i '/region/s,us-east-1,us-west-2,' /etc/hue/conf/hue.ini

Change region in Hue.ini file

sudo sed -i '/region/s,us-east-1, us-west-2,' /etc/hue/conf/hue.ini

Uncomment Hue region parametr in Hue.ini

sudo sed -i '/region/s/^##//g' /etc/hue/conf/hue.ini

HA_Database_Setup_Init.pp Script

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

class hadoop_hive {

  class deploy ($roles) {

    if ('hive-client' in $roles) {
      include hadoop_hive::client
    }

    if ('hive-metastore-server' in $roles) {
      include hadoop_hive::metastore
    }

    if ('hive-server2' in $roles) {
      include hadoop_hive::server2
      if ('hive-metastore-server' in $roles) {
        Class['Hadoop_hive::Metastore_server'] -> Class['Hadoop_hive::Server']
      }
    }

    if ('hive-hbase' in $roles) {
      include hadoop_hive::hbase
    }

    # Need to make sure local mysql server is setup correctly (in case hive is
    # using it) before initializing the schema
    if ('hive-client' or 'hive-metastore-server' or 'hive-server2' in $roles) {
      if ('mysql-server' in $roles) {
        Class['Bigtop_mysql::Server'] -> Exec<| title == 'init hive-metastore schema' |>
      }
    }
  }

  class client_package {
    package { "hive":
      ensure => latest,
    }
  }

  class hive_keytab {
    include hadoop_hive::client_package
    require kerberos::client
    kerberos::host_keytab { "hive":
      spnego => true,
      require => Package["hive"],
    }
  }

  class common_config ($hbase_master = "",
                       $hbase_zookeeper_quorum = "",
                       $kerberos_realm = "",
                       $server2_thrift_port = "10000",
                       $server2_thrift_http_port = "10001",
                       $hive_execution_engine = "mr",
                       $metastore_server_uris = [],
                       $metastore_database_type = 'postgres',
                       $metastore_database_host = $fqdn,
                       $metastore_database_port = '5432',
                       $metastore_database_name = 'hive',
                       $metastore_database_user = 'postgres',
                       $metastore_database_password = 'root1234',
                       $hdfs_uri = undef,
                       $hive_env_overrides = {},
                       $hive_site_overrides = {},
                       $hive_log4j2_overrides = {},
                       $hive_exec_log4j2_overrides = {},
                       $hive_beeline_log4j2_overrides = {},
                       $hive_parquet_logging_overrides = {},
                       $hiveserver2_site_overrides = {},
                       $hive_llap_daemon_log4j2_overrides = {},
                       $user_log_dir = undef,
                       $java_tmp_dir = undef,
                       $use_dynamodb = false,
                       $use_aws_hm_client = false,
                       $use_emr_goodies = false,
                       $use_emr_s3_select = false,
                       $use_kinesis = false,
                       $use_hudi = false) {
    include hadoop_hive::client_package
    if ($kerberos_realm and $kerberos_realm != "") {
      include hadoop_hive::hive_keytab
    }

    $sticky_dirs = delete_undef_values([$java_tmp_dir, $user_log_dir])

    file { $sticky_dirs :
      ensure => "directory",
      owner  => "root",
      group  => "root",
      mode   => "1777",
      require => Package['hive']
    }

    if ($use_dynamodb) {
      include emr_ddb::library

      file { '/usr/lib/hive/auxlib/emr-ddb-hive.jar':
        ensure  => link,
        target  => '/usr/share/aws/emr/ddb/lib/emr-ddb-hive.jar',
        tag     => 'hive-aux-jar',
        require => [Package['emr-ddb'], Package['hive']]
      }
    }

    if ($use_aws_hm_client) {
      include aws_hm_client::library

      file { '/usr/lib/hive/auxlib/aws-glue-datacatalog-hive2-client.jar':
        ensure  => link,
        target  => '/usr/share/aws/hmclient/lib/aws-glue-datacatalog-hive2-client.jar',
        tag     => 'hive-aux-jar',
        require => [Package['aws-hm-client'], Package['hive']]
      }

      file { '/usr/lib/hive/auxlib/hive-openx-serde.jar':
        ensure  => link,
        target  => '/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar',
        tag     => 'hive-aux-jar',
        require => [Package['aws-hm-client'], Package['hive']]
      }
    }

    if ($use_emr_s3_select) {
      include emr_s3_select::library

      file { '/usr/lib/hive/auxlib/emr-s3-select-hive-connector.jar':
        ensure  => link,
        target  => '/usr/share/aws/emr/s3select/lib/emr-s3-select-hive-connector.jar',
        tag     => 'hive-aux-jar',
        require => [Package['emr-s3-select'], Package['hive']]
      }
    }

    if ($use_emr_goodies) {
      include emr_goodies::library

      file { '/usr/lib/hive/auxlib/emr-hive-goodies.jar':
        ensure  => link,
        target  => '/usr/share/aws/emr/goodies/lib/emr-hive-goodies.jar',
        tag     => 'hive-aux-jar',
        require => [Package['emr-goodies'], Package['hive']]
      }
    }

    if ($use_kinesis) {
      include emr_kinesis::library

      file { '/usr/lib/hive/auxlib/emr-kinesis-hive.jar':
        ensure  => link,
        target  => '/usr/share/aws/emr/kinesis/lib/emr-kinesis-hive.jar',
        tag     => 'hive-aux-jar',
        require => [Package['emr-kinesis'], Package['hive']]
      }
    }

    if ($use_hudi) {
      include hudi::library

      file { '/usr/lib/hive/auxlib/hudi-hadoop-mr-bundle.jar':
        ensure  => link,
        target  => '/usr/lib/hudi/hudi-hadoop-mr-bundle.jar',
        tag     => 'hive-aux-jar',
        require => [Package['hudi'], Package['hive']]
      }
    }

    $metastore_database_url = generate_metastore_url(
      $metastore_database_type,
      $metastore_database_host,
      $metastore_database_port,
      $metastore_database_name
    )
    $metastore_database_driver_class = get_metastore_driver_class($metastore_database_type)
    $metastore_database_schema_type = get_metastore_schema_type($metastore_database_type)

    bigtop_file::site { '/etc/hive/conf/hive-site.xml':
      content => template('hadoop_hive/hive-site.xml'),
      overrides => $hive_site_overrides,
      require => Package['hive'],
    }

    bigtop_file::site { '/etc/hive/conf/hiveserver2-site.xml':
      content => template('hadoop_hive/hiveserver2-site.xml'),
      overrides => $hiveserver2_site_overrides,
      require => Package['hive'],
    }

    bigtop_file::properties { '/etc/hive/conf/hive-log4j2.properties':
      content => template('hadoop_hive/hive-log4j2.properties'),
      overrides => $hive_log4j2_overrides,
      require => Package['hive'],
    }

    bigtop_file::properties { '/etc/hive/conf/hive-exec-log4j2.properties':
      source => '/etc/hive/conf.dist/hive-exec-log4j2.properties.default',
      overrides => $hive_exec_log4j2_overrides,
      require => Package['hive'],
    }

    bigtop_file::properties { '/etc/hive/conf/beeline-log4j2.properties':
      source => '/etc/hive/conf.dist/beeline-log4j2.properties.default',
      overrides => $hive_beeline_log4j2_overrides,
      require => Package['hive'],
    }

    bigtop_file::properties { '/etc/hive/conf/parquet-logging.properties':
      source => '/etc/hive/conf.dist/parquet-logging.properties.default',
      overrides => $hive_parquet_logging_overrides,
      require => Package['hive'],
    }

    bigtop_file::properties { '/etc/hive/conf/llap-daemon-log4j2.properties':
      source => '/etc/hive/conf.dist/llap-daemon-log4j2.properties.default',
      overrides => $hive_llap_daemon_log4j2_overrides,
      require => Package['hive'],
    }

    bigtop_file::env { '/etc/hive/conf/hive-env.sh':
      overrides => $hive_env_overrides,
      content => template('hadoop_hive/hive-env.sh'),
      require => Package['hive'],
    }
    

    include hadoop_hive::init_metastore_schema
  }

  class client($hbase_master = "",
      $hbase_zookeeper_quorum = "",
      $hive_execution_engine = "mr") {

      include hadoop_hive::common_config
  }

  class server2 {
    include hadoop_hive::common_config

    package { 'hive-server2':
      ensure => latest,
    }

    service { 'hive-server2':
      ensure    => running,
      require   => [Package['hive'], Package['hive-server2'], Class['Hadoop_hive::Init_metastore_schema']],
      subscribe => [Bigtop_file::Site['/etc/hive/conf/hive-site.xml'], Bigtop_file::Env['/etc/hive/conf/hive-env.sh']],
      hasrestart => true,
      hasstatus => true,
    }
    Kerberos::Host_keytab <| title == "hive" |> -> Service["hive-server2"]
    Service <| title == "hive-metastore" |> -> Service["hive-server2"]
    File <| tag == 'hive-aux-jar' |> -> Service['hive-server2']
    Bigtop_file::Env <| title == '/etc/hadoop/conf/hadoop-env.sh' |> ~> Service['hive-server2']
    Bigtop_file::Site <| tag == 'hadoop-plugin' or title == '/etc/hadoop/conf/core-site.xml' |> ~> Service['hive-server2']
  }

  class metastore {
    include hadoop_hive::common_config

    package { 'hive-metastore':
      ensure => latest,
    }

    service { 'hive-metastore':
      ensure    => running,
      require   => [Package['hive'], Package['hive-metastore'], Class['Hadoop_hive::Init_metastore_schema']],
      subscribe => [Bigtop_file::Site['/etc/hive/conf/hive-site.xml'], Bigtop_file::Env['/etc/hive/conf/hive-env.sh']],
      hasrestart => true,
      hasstatus => true,
    }
    Kerberos::Host_keytab <| title == "hive" |> -> Service["hive-metastore"]
    File <| title == "/etc/hadoop/conf/core-site.xml" |> -> Service["hive-metastore"]
    File <| tag == 'hive-aux-jar' |> -> Service['hive-metastore']
    Bigtop_file::Env <| title == '/etc/hadoop/conf/hadoop-env.sh' |> ~> Service['hive-metastore']
    Bigtop_file::Site <| tag == 'hadoop-plugin' or title == '/etc/hadoop/conf/core-site.xml' |> ~> Service['hive-metastore']
  }

  class database_connector {
    include hadoop_hive::common_config

    case $common_config::metastore_database_type {
      'mysql': {
        mysql_connector::link {'/usr/lib/hive/lib/mysql-connector-java.jar':
          require => Package['hive'],
         }
      }
      'mariadb': {
         mariadb_connector::link {'/usr/lib/hive/lib/mariadb-connector-java.jar':
          require => Package['hive']
         }
      }
        'postgres': {
         postgresql_connector::link {'/usr/lib/hive/lib/postgresql-9.4.1208.jre7.jar':
          require => Package['hive']
         }
      }
      'derby': {
        # do nothing
      }
      default: {
        fail("$common_config::metastore_database_type is not supported. Supported database types are ", $common_config::supported_database_types)
      }
    }
  }

  class init_metastore_schema($init_schema = true, $skip_init_schema = false) {

    include hadoop_hive::common_config
    include hadoop_hive::database_connector
    
    if (! $skip_init_schema) {
      if ($init_schema) {
        exec { 'init hive-metastore schema':
          command   => "/usr/lib/hive/bin/schematool -dbType postgres -initSchema -verbose",
          require   => [Package['hive'], Class['Hadoop_hive::Database_connector']],
          subscribe => [Bigtop_file::Site['/etc/hive/conf/hive-site.xml'], Bigtop_file::Env['/etc/hive/conf/hive-env.sh']],
          logoutput => true,
          unless    => "/usr/lib/hive/bin/schematool -dbType postgres -info",
          tries     => hiera('hadoop::ha', 'disabled') ? {"auto" => 10, default => 1},
          try_sleep => 5,
        }
      } else {
        exec { 'get hive-metastore info':
          command   => "/usr/lib/hive/bin/schematool -dbType postgres -info",
          require   => [Package['hive'], Class['Hadoop_hive::Database_connector']],
          subscribe => [Bigtop_file::Site['/etc/hive/conf/hive-site.xml'], Bigtop_file::Env['/etc/hive/conf/hive-env.sh']],
          logoutput => true,
          tries     => 120,
          try_sleep => 5,
        }
      }
    }
  }

  class hbase {
    package { 'hive-hbase':
      ensure => latest,
    }
  }
}

HA_Database_Setup_Configurations Json

[
    {
        "Classification": "hive-site",
        "Properties": {
            "javax.jdo.option.ConnectionUserName": "postgres",
            "javax.jdo.option.ConnectionDriverName": "org.postgresql.Driver",
            "javax.jdo.option.ConnectionPassword": "root1234",
            "javax.jdo.option.ConnectionURL": "jdbc:postgresql://database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com:5432/hive_db"
        }
    },
    {
        "Classification": "hue-ini",
        "Properties": {},
        "Configurations": [
            {
                "Classification": "desktop",
                "Properties": {},
                "Configurations": [
                    {
                        "Classification": "database",
                        "Properties": {
                            "password": "root1234",
                            "engine": "postgres",
                            "port": "5432",
                            "host": "database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com",
                            "name": "hue_db",
                            "user": "postgres"
                        }
                    }
                ]
            }
        ]
    },
    {
        "Classification": "oozie-site",
        "Properties": {
            "oozie.service.JPAService.jdbc.password": "root1234",
            "oozie.service.JPAService.jdbc.url": "jdbc:postgresql://database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com:5432/oozie_db",
            "oozie.service.JPAService.jdbc.driver": "org.postgresql.Driver",
            "oozie.service.JPAService.jdbc.username": "postgres"
        }
    }
]

HA_Database_Setup_Bootstrap Script

#!/bin/bash
sudo yum install -y gcc python-setuptools python-devel postgresql-devel
sudo easy_install psycopg2
sudo python -m pip install psycopg2-binary
sudo yum install postgresql -y
PGPASSWORD=root1234 psql --username=postgres --host=database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com --command='CREATE DATABASE hive_db'
PGPASSWORD=root1234 psql --username=postgres --host=database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com --command='CREATE DATABASE oozie_db'
PGPASSWORD=root1234 psql --username=postgres --host=database-1.cxv0eh6uhsan.us-east-1.rds.amazonaws.com --command='create database hue_db with lc_collate="en_US.UTF-8"'
cd /var/aws/emr/bigtop-deploy/puppet/modules/hadoop_hive/manifests
File="init.pp"
if [ -f "$File" ]; then
mkdir /home/hadoop/code
sudo mv "$File" /home/hadoop/code/
aws s3 cp s3://nkityd-bucket/init.pp /home/hadoop/
sudo cp /home/hadoop/init.pp "$File"
fi
echo "Successful"

MR Java Wordcount

package com.amazonaws.emr.knet;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable>{

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
    	
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf,"word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}

Pom file for MR Java Wordcount

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.amazonaws.emr</groupId>
  <artifactId>knet</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <name>firstMapReduce</name>
  
   <properties>
    <maven-compiler-plugin.version>3.1</maven-compiler-plugin.version>
    <java.version>1.8</java.version>
    <hadoop.version>2.8.3</hadoop.version>
  </properties>
  <dependencies>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-common</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce-client-common</artifactId>
      <version>${hadoop.version}</version>
      <scope>provided</scope>
  </dependency>
</dependencies>
<build>
          <plugins>
            <plugin>
              <artifactId>maven-compiler-plugin</artifactId>
              <version>${maven-compiler-plugin.version}</version>
              <configuration>
                <source>${java.version}</source>
                <target>${java.version}</target>
              </configuration>
            </plugin>
    </plugins>
  </build>
  
</project>

Get Yarn Node details

yarn node -list -showDetails

Killing a YARN application

yarn application -kill application_id

Mapreduce Pi job

yarn jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar pi 5 10

Tera Sort for EMR

hadoop jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar terasort /nkityd/teragendata /nkityd/teragensorteddata/

Tera generate data for EMR

hadoop jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar teragen 10000000 /nkityd/teragendata

Connect to mysql database for HMS

mysql -h ip-172-31-39-192.ec2.internal  -u hive -p

Connect to beeline

beeline -u "jdbc:hive2://localhost:10000/default" -n hdfs

#aws #emr #spark

Spark example job

"""
A simple example demonstrating basic Spark SQL features using fictional
data inspired by a paper on determining the optimum length of chopsticks.
https://www.ncbi.nlm.nih.gov/pubmed/15676839
Run with:
  ./bin/spark-submit OptimumChopstick.py
"""
from __future__ import print_function
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.storagelevel import StorageLevel
#rdd.persist(StorageLevel.MEMORY_ONLY_SER)

# Get avg Food pinching effeciency by length
def AvgEffeciencyByLength(df):
    meansDf = df.groupby('ChopstickLength').mean('FoodPinchingEffeciency').orderBy('avg(FoodPinchingEffeciency)',ascending=0)
    return meansDf

# init
spark = SparkSession.builder.appName("Optimum Chopstick").getOrCreate()
sc = spark.sparkContext
input_loc = "s3://llubbe-gdelt-open-data/ChopstickEffeciency/"

# Read input by line
lines = sc.textFile(input_loc)
parts = lines.map(lambda l: l.split(","))
parts.persist(StorageLevel.MEMORY_ONLY_SER)
# Each line is converted to a tuple.
chopstickItems = parts.map(lambda p: (str(p[0]), float(p[1]), int(p[2]), int(p[3].strip())))

# Define a schema
fields = [StructField("TestID", StringType()),
          StructField("FoodPinchingEffeciency", DoubleType()), 
          StructField("Individual", IntegerType()), 
          StructField("ChopstickLength", IntegerType())]
schema = StructType(fields)

# Apply the schema to the RDD
chopsticksDF = spark.createDataFrame(chopstickItems, schema)

effeciencyByLength = AvgEffeciencyByLength(chopsticksDF)
effeciencyByLength.distinct().count()

moar_chopsticksDF = spark.read.load(input_loc, format="csv", schema=schema)
moar_effeciencyByLength = AvgEffeciencyByLength(moar_chopsticksDF)
moar_effeciencyByLength.distinct().count()

spark.stop()

ssh -i ~/Documents/nkityd.pem hadoop@ec2-54-236-239-19.compute-1.amazonaws.com cat /etc/hive/conf/hive-site.xml | grep "thrift://" | sed 's/\<value>//g' | sed 's/\<\/value>//g' | awk '{print "HMS URI: " $1 }'

Killing a YARN application

yarn application -kill application_id

Run MR Pi example

yarn jar /usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar pi 5 10

Save snippets that work with our extensions

Available in the Chrome Web Store

Get Firefox Add-on

Get VS Code extension