Partitioning and Bucketing in Hive
Partitioning
and Bucketing in Hive:
These two techniques are query
optimization techniques.
Partitioning:
· It is the way to organize the data in partitions by dividing the table
into different parts based on partition
keys.
· If there are repeated values in the any field or column then partition
technique can be applied.
· Each table in hive can have one or more partitions based on column or
partition key.
· Packages to be imported for partitoning
sethive.exec.dynamic.partition.mode=nonstrict
sethive.exec.dynamic.partition=
true
Bucketing:
· It is the technique which is used
for segregating hive tables into
multiple files.
· The divisionof hive tables or partitions
isbased on hash function of field which is present in table.
· Package for bucketing
sethive.enforce.bucketing=true;
Partitioned Using Bucketking
create table
txnByCat(txnno INT, txndate STRING, custno INT, amount DOUBLE,
product STRING, city
STRING, state STRING, spendby STRING)
partitioned by
(category STRING)
clustered by (state)
INTO 10 buckets
row format delimited
fields terminated by
','
stored as textfile;
set
hive.exec.dynamic.partition.mode=nonstrict;
set
hive.exec.dynamic.partition=true;
set
hive.enforce.bucketing=true;
from txn909 txn INSERT
OVERWRITE TABLE txnByCat PARTITION(category) select
txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby,
txn.category DISTRIBUTE BY category;
· SELECT txnno, product, state FROM txnsgusto12 TABLESAMPLE(BUCKET 5 OUT OF 10)
·
SELECT txnno, product,state,category
FROM txnsgusto12 TABLESAMPLE(BUCKET 4 OUT OF 5) LIMIT 10;
CUSTOMER TABLE (CUSTS)
CREATE EXTERNAL TABLE IF NOT EXISTS
custsrecord(cusId INT, name1 STRING, name2 STRING,age INT)
PARTITIONED BY(profession STRING)
CLUSTERED BY (age) INTO 5 BUCKETS
row
format delimited
fields terminated by ','
stored as textfile location
'/user/hive/warehouse/gusto/customer/';
DYNAMIC PARTITIONING
===============================================
create table txnBUC(txnno
INT, txndate STRING, custno INT, amount DOUBLE,
product STRING, city STRING,
state STRING, spendby STRING)
partitioned by (category
STRING)
clustered by (state) INTO 10
buckets
row format delimited
fields terminated by ','
stored as textfile;
set hive.exec.dynamic.partition.mode=nonstrict;
set
hive.exec.dynamic.partition=true;
set
hive.enforce.bucketing=true;
Set
hive.exec.mode.local.auto= true;
(Specially For Partition)
from txn909 txn INSERT
OVERWRITE TABLE txnBUC PARTITION(category) select
txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby,
txn.category DISTRIBUTE BY category;
CREATE EXTERNAL TABLE IF NOT
EXISTS custsrecord(cusId INT, name1 STRING, name2 STRING,age INT)
PARTITIONED BY(profession
STRING)
CLUSTERED BY (age) INTO 5
BUCKETS
row format delimited
fields terminated by ',';
from txn909 txn INSERT
OVERWRITE TABLE txnBUC PARTITION(category) select
txn.txnno,txn.txndate,txn.custno,txn.amount,txn.product,txn.city,txn.state,txn.spendby,
txn.category DISTRIBUTE BY category;
from customer cus INSERT
OVERWRITE TABLE custsrecord PARTITION(profession) select
cus.cusId,cus.name1,cus.name2,cus.age, cus.profession DISTRIBUTE BY profession;
from txnsgusto1 tINSERT
OVERWRITE TABLE txnsgusto12PARTITION(category)SELECT t.txnno, t.txndate,
t.custno, t.amount, t.product, t.city, t.state, t.spendby, t.category
DISTRIBUTED BY category;
CREATE EXTERNAL TABLE IF NOT
EXISTS CUST9(custno STRING, firstname STRING, lastname STRING,age INT)
PARTITIONED BY(profession
STRING)
CLUSTERED BY (age) INTO 5
BUCKETS
row format delimited
fields terminated by ','
STORED AS textfile;
from customer cus INSERT
OVERWRITE TABLE CUST9 PARTITION(profession) select cus.custno,cus.firstname,cus.lastname,cus.age,
cus.profession DISTRIBUTE BY profession;
custno string
firstname string
lastname string
age int
profession string
txnBUC (t) CUST9(c)
SELECT t.state, t.category,
t.txnno, c.firstname, c.age FROM txnBUC t JOIN CUST9 c on t.custno= c.custno
LIMIT 15;
SELECT t.product, t.category,
t.custno, c.firstname,c.custno, c.profession FROM txnBUC t LEFT OUTER JOIN CUST9 c on t.custno =
c.custno LIMIT 10;
set hive.auto.convert.join =
false;
Comments
Post a Comment