Hive

概念

基于Hadoop的一个数据仓库工具(Data warehouse)
4个特点

面向主题的
集成的
相对稳定的
反应历史变化的
数据集合，用于支持管理决策

HDFS 为Hive提供存储
MapReduce 为Hive提供计算引擎
YARN 为Hive提供资源调度

作用

可以将可结构化的数据映射为一张数据库表, 并提供HQL(Hive QL)查询功能，底层数据存储在HDFS中
本质上是将Hql语句转化为mapReduce任务运行

通过类SQL来分析大数据，而避免了写 MapReduce 程序来分析数据，这样使得分析数据更容易

架构

元数据

存放数据描述的数据（用于映射表数据）
域(id)、表名和文件的对应关系(确认要查询那个数据表)、字段名和对应域值的关系(哪列数据)、分隔符(按照什么符号区分列数据)

存储介质：RDBMS (关系型数据库) 用关系型数据库存元数据信息。

数据存储

数据库

数据表

分区和分桶

表数据

库+表+分区 = HDFS上的一个目录
分区和分桶 = HDFS上的一个文件

基础操作示例

一、DDL

1、创建内部表
create table student(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,”;

2、创建外部表

// 没有指定外部路径, 表的数据目录存储在默认的仓库路径中
create external table student_ext_1(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,”;

// 指定一个不存在的外部路径: 创建表的时候，会自动给你创建表目录
create external table student_ext_2(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,” location “/student_ext_2”;

// 指定一个已经存在的目录: 并且有数据
//在linux中执行
hadoop fs -mkdir -p /student_ext_3
hadoop fs -put /home/data/student.txt /student_ext_3
//在hive命令行中执行
create external table student_ext_3(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,” location “/student_ext_3”;

3、创建分区表

// 创建只有一个分区字段的分区表：
create table student_ptn(id int, name string, sex string, age int, department string) partitioned by (city string comment “partitioned field”) row format delimited fields terminated by “,”;

load data local inpath “/home/data/student.txt” into table student_ptn; XXXXXXX

// 把数据导入到一个不存在的分区，它会自动创建该分区
load data local inpath “/home/data/student.txt” into table student_ptn partition(city=”beijing”); √√√√

// 把数据导入到一个已经存在的分区
alter table student_ptn add partition (city=”chongqing”);
load data local inpath “/home/data/student.tx” into table student_ptn partition(city=”chongqing”);

// 创建有多个分区字段的分区表：
create table student_ptn_date(id int, name string, sex string, age int, department string) partitioned by (city string comment “partitioned field”, dt string) row format delimited fields terminated by “,”;

// 往分区中导入数据:
load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”); //报错

load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”, dt=’2012-12-12’); //正确

// 不能在导入数据的时候指定多个分区定义
load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”, dt=’2012-12-14’) partition (city=”beijing” , dt=’2012-12-13’); XXXXXX

// 添加分区
alter table student_ptn_date add partition(city=”beijing”, dt=’2012-12-14’) partition (city=”beijing” , dt=’2012-12-13’); √√√√√√√√
alter table student_ptn_date add partition(city=”chongqing”, dt=’2012-12-14’) partition (city=”chongqing” , dt=’2012-12-13’);

// 查询一个分区表有那些分区
show partitions student_ptn;
show partitions student_ptn_date;
show partitions student;

4、创建分桶表

// 创建一个分桶表
create table student_bucket (id int, name string, sex string, age int, department string) clustered by (department) sorted by (age desc, id asc) into 3 buckets row format delimited fields terminated by “,”;

5、从查询语句的结果创建新表
通过下面的命令：
create table … as select ….
查询例子：
select department, count() as total from student group by department;
完整的CTAS语句：
create table dpt_count as select department, count() as total from student group by department;

6、通过like复制已有表的结构创建新表
create table student_like like student;

删除表

drop table student;
drop table if exists student;

修改表

1、修改表名

alter table student rename to studentss;

2、修改字段

    添加字段： 
    alter table student add columns (city string, dt string);
    删除字段： 
    alter table student drop columns (city);   XXXXXXXX
    替换字段：
    alter table student replace columns (id int, name string, sex string, age int);
    改变列的定义：
    alter table student change id newid string comment "new id";
    改变列的顺序：
    alter table student change sex sex string first;
    alter table student change name name string after sex;

3、修改分区

    添加分区：
    alter table student_ptn add partition(city='tiajin') partition(city='shanghai');

    删除分区：
    alter table student_ptn drop partition(city='tiajin');  
    alter table student_ptn drop partition(city='tiajin'),partition(city='shanghai'); 

    修改分区的数据目录：
    alter table student_ptn partition(city="beijing") set location "/stu_beijing";   XXXX
    alter table student_ptn partition(city="beijing") set location "hdfs://hadoop02:9000/stu_beijing";  √√√√√√√

清空表：

truncate table student;
等价于下面：
hadoop fs -rm -r /user/hive/warehouse/myhive.db/student/*
前面的load和hadoop fs -put命令也是等价的。

显示创建表的详细信息

show create table student;

// 准备student表：以及数据：  
create database if not exists myhive;
use myhive;
create table student(id int, name string, sex string, age int, department string) row format delimited fields terminated by ",";
load data local inpath "/home/data/student.txt" into table student;
select * from student;

修改表

1、修改表名
alter table student rename to studentss;

2、修改字段

    添加字段： 
    alter table student add columns (city string, dt string);
    删除字段： 
    alter table student drop columns (city);   //报错XXXXXXXX
    替换字段：
    alter table student replace columns (id int, name string, sex string, age int);
    改变列的定义：
    alter table student change id newid string comment "new id";
    改变列的顺序：
    alter table student change sex sex string first;
    alter table student change name name string after sex;

3、修改分区

    添加分区：
    alter table student_ptn add partition(city='tiajin') partition(city='shanghai');

    删除分区：
    alter table student_ptn drop partition(city='tiajin');  
    alter table student_ptn drop partition(city='tiajin'),partition(city='shanghai'); 

    修改分区的数据目录：
    alter table student_ptn partition(city="beijing") set location "/stu_beijing";   XXXX
    alter table student_ptn partition(city="beijing") set location "hdfs://hadoop0:8020/stu_beijing";  √√√√√√√

二、DML

1、导入数据

load
load data local inpath “/home/data/student.txt” into table student;

insert
insert into table student (id, name, sex, age, department) values (101,”huangbo”,”M”,222,”IT”);

创建分区表：

create table student_ptn (id int, name string, sex string, age int) partitioned by (department string) row format delimited fields terminated by “,”;

单重插入：
insert into table student_ptn partition (department = ‘IS’) select id,sex,name,age from student where department = ‘IS’;
insert into table student_ptn partition (department = ‘CS’) select id,sex,name,age from student where department = ‘CS’;
insert into table student_ptn partition (department = ‘MA’) select id,sex,name,age from student where department = ‘MA’;

多重插入：
from student
insert into table student_ptn partition (department = ‘IS’) select id,sex,name,age where department = ‘IS’
insert into table student_ptn partition (department = ‘CS’) select id,sex,name,age where department = ‘CS’
insert into table student_ptn partition (department = ‘MA’) select id,sex,name,age where department = ‘MA’

2、导出数据

insert overwrite local directory "/home/data/cs_student" select * from student where department = 'CS';

insert overwrite directory "/home/data/cs_student" select * from student where department = 'CS';

3、查询

select ... from ...  join   ... on... where  group by  having  limit