Partitioning and Bucketing in Hive

Partitioning and Bucketing in Hive:

These two techniques are query optimization techniques.

Partitioning: 

·       It is the way to organize the data in partitions by dividing the table into different parts  based on partition keys.

·       If there are repeated values in the any field or column then partition technique can be applied.

·       Each table in hive can have one or more partitions based on column or partition key.

·       Packages to be imported for partitoning

sethive.exec.dynamic.partition.mode=nonstrict

sethive.exec.dynamic.partition= true

Bucketing:

·       It is the technique  which is used for segregating hive tables into  multiple files.

·       The divisionof hive tables or partitions  isbased on hash function of field which is present in table.

·       Package for bucketing

sethive.enforce.bucketing=true;

 

Partitioned Using Bucketking

create table txnByCat(txnno INT, txndate STRING, custno INT, amount DOUBLE,

product STRING, city STRING, state STRING, spendby STRING)

partitioned by (category STRING)

clustered by (state) INTO 10 buckets

row format delimited

fields terminated by ','

stored as textfile;

set hive.exec.dynamic.partition.mode=nonstrict;

set hive.exec.dynamic.partition=true;

set hive.enforce.bucketing=true;

from txn909 txn INSERT OVERWRITE TABLE txnByCat PARTITION(category) select txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby, txn.category DISTRIBUTE BY category;

·       SELECT txnno, product, state FROM  txnsgusto12 TABLESAMPLE(BUCKET 5 OUT OF 10)

·       SELECT txnno, product,state,category FROM txnsgusto12 TABLESAMPLE(BUCKET 4 OUT OF 5) LIMIT 10;

 

CUSTOMER TABLE (CUSTS)

CREATE EXTERNAL TABLE IF NOT EXISTS custsrecord(cusId INT, name1 STRING, name2 STRING,age INT)

PARTITIONED BY(profession STRING)

CLUSTERED BY (age) INTO 5 BUCKETS

row  format delimited

fields terminated by ','

stored as textfile location '/user/hive/warehouse/gusto/customer/';

 

 

DYNAMIC PARTITIONING

===============================================

 

create table txnBUC(txnno INT, txndate STRING, custno INT, amount DOUBLE,

product STRING, city STRING, state STRING, spendby STRING)

partitioned by (category STRING)

clustered by (state) INTO 10 buckets

row format delimited

fields terminated by ','

stored as textfile;

 

 

set hive.exec.dynamic.partition.mode=nonstrict;

set hive.exec.dynamic.partition=true;

set hive.enforce.bucketing=true;

 

 

 

 

Set hive.exec.mode.local.auto= true;

   (Specially For Partition)

 

 

from txn909 txn INSERT OVERWRITE TABLE txnBUC PARTITION(category) select txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby, txn.category DISTRIBUTE BY category;

 

 

 

 

 

 

 

 

CREATE EXTERNAL TABLE IF NOT EXISTS custsrecord(cusId INT, name1 STRING, name2 STRING,age INT)

PARTITIONED BY(profession STRING)

CLUSTERED BY (age) INTO 5 BUCKETS

row  format delimited

fields terminated by ',';

 

from txn909 txn INSERT OVERWRITE TABLE txnBUC PARTITION(category) select txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby, txn.category DISTRIBUTE BY category;

 

from customer cus INSERT OVERWRITE TABLE custsrecord PARTITION(profession) select cus.cusId,cus.name1,cus.name2,cus.age, cus.profession DISTRIBUTE BY profession;

 

from txnsgusto1 tINSERT OVERWRITE TABLE txnsgusto12PARTITION(category)SELECT t.txnno, t.txndate, t.custno, t.amount, t.product, t.city, t.state, t.spendby, t.category DISTRIBUTED BY category;

 

 

 

 

 

CREATE EXTERNAL TABLE IF NOT EXISTS CUST9(custno STRING, firstname STRING, lastname STRING,age INT)

PARTITIONED BY(profession STRING)

CLUSTERED BY (age) INTO 5 BUCKETS

row  format delimited

fields terminated by ','

STORED AS textfile;

 

 

 

from customer cus INSERT OVERWRITE TABLE CUST9 PARTITION(profession) select cus.custno,cus.firstname,cus.lastname,cus.age, cus.profession DISTRIBUTE BY profession;

 

 

 

 

 

custno                  string

firstname               string

lastname                string

age                     int

profession              string

 

 

txnBUC (t)           CUST9(c)

 

 

SELECT t.state, t.category, t.txnno, c.firstname, c.age FROM txnBUC t JOIN CUST9 c on t.custno= c.custno LIMIT 15;

 

SELECT t.product, t.category, t.custno, c.firstname,c.custno, c.profession FROM txnBUC t  LEFT OUTER JOIN CUST9 c on t.custno = c.custno LIMIT 10;

 

 

set hive.auto.convert.join = false;


Comments

Popular posts from this blog

Hadoop

Problem Statement Of Real Estate Use Cases

Problem Statement Of Bank Marketing analysis