Hive
概念
基于Hadoop的一个数据仓库工具(Data warehouse)
4个特点
- 面向主题的
- 集成的
- 相对稳定的
- 反应历史变化的
数据集合,用于支持管理决策
HDFS 为Hive提供存储
MapReduce 为Hive提供计算引擎
YARN 为Hive提供资源调度
作用
可以将可结构化的数据映射为一张数据库表, 并提供HQL(Hive QL)查询功能,底层数据存储在HDFS中
本质上是 将Hql语句转化为mapReduce任务运行
通过类SQL
来分析大数据,而避免了写 MapReduce 程序来分析数据,这样使得分析数据更容易
架构
元数据
存放数据描述的数据(用于映射表数据 )
域(id)、表名和文件的对应关系(确认要查询那个数据表)、字段名和对应域值的关系(哪列数据)、分隔符(按照什么符号区分列数据)
存储介质:RDBMS (关系型数据库) 用关系型数据库存元数据信息。
数据存储
数据库
数据表
分区和分桶
表数据
库+表+分区 = HDFS上的一个目录
分区和分桶 = HDFS上的一个文件
基础操作示例
一、DDL
1、创建内部表
create table student(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,”;
2、创建外部表
// 没有指定外部路径, 表的数据目录存储在默认的仓库路径中
create external table student_ext_1(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,”;
// 指定一个不存在的外部路径: 创建表的时候,会自动给你创建表目录
create external table student_ext_2(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,” location “/student_ext_2”;
// 指定一个已经存在的目录: 并且有数据
//在linux中执行
hadoop fs -mkdir -p /student_ext_3
hadoop fs -put /home/data/student.txt /student_ext_3
//在hive命令行中执行
create external table student_ext_3(id int, name string, sex string, age int, department string) row format delimited fields terminated by “,” location “/student_ext_3”;
3、创建分区表
// 创建只有一个分区字段的分区表:
create table student_ptn(id int, name string, sex string, age int, department string) partitioned by (city string comment “partitioned field”) row format delimited fields terminated by “,”;
load data local inpath “/home/data/student.txt” into table student_ptn; XXXXXXX
// 把数据导入到一个不存在的分区,它会自动创建该分区
load data local inpath “/home/data/student.txt” into table student_ptn partition(city=”beijing”); √√√√
// 把数据导入到一个已经存在的分区
alter table student_ptn add partition (city=”chongqing”);
load data local inpath “/home/data/student.tx” into table student_ptn partition(city=”chongqing”);
// 创建有多个分区字段的分区表:
create table student_ptn_date(id int, name string, sex string, age int, department string) partitioned by (city string comment “partitioned field”, dt string) row format delimited fields terminated by “,”;
// 往分区中导入数据:
load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”); //报错
load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”, dt=’2012-12-12’); //正确
// 不能在导入数据的时候指定多个分区定义
load data local inpath “/home/data/student.txt” into table student_ptn_date partition(city=”beijing”, dt=’2012-12-14’) partition (city=”beijing” , dt=’2012-12-13’); XXXXXX
// 添加分区
alter table student_ptn_date add partition(city=”beijing”, dt=’2012-12-14’) partition (city=”beijing” , dt=’2012-12-13’); √√√√√√√√
alter table student_ptn_date add partition(city=”chongqing”, dt=’2012-12-14’) partition (city=”chongqing” , dt=’2012-12-13’);
// 查询一个分区表有那些分区
show partitions student_ptn;
show partitions student_ptn_date;
show partitions student;
4、创建分桶表
// 创建一个分桶表
create table student_bucket (id int, name string, sex string, age int, department string) clustered by (department) sorted by (age desc, id asc) into 3 buckets row format delimited fields terminated by “,”;
5、从查询语句的结果创建新表
通过下面的命令:
create table … as select ….
查询例子:
select department, count() as total from student group by department;
完整的CTAS语句:
create table dpt_count as select department, count() as total from student group by department;
6、通过like复制已有表的结构创建新表
create table student_like like student;
删除表
drop table student;
drop table if exists student;
修改表
1、修改表名
alter table student rename to studentss;
2、修改字段
添加字段:
alter table student add columns (city string, dt string);
删除字段:
alter table student drop columns (city); XXXXXXXX
替换字段:
alter table student replace columns (id int, name string, sex string, age int);
改变列的定义:
alter table student change id newid string comment "new id";
改变列的顺序:
alter table student change sex sex string first;
alter table student change name name string after sex;
3、修改分区
添加分区:
alter table student_ptn add partition(city='tiajin') partition(city='shanghai');
删除分区:
alter table student_ptn drop partition(city='tiajin');
alter table student_ptn drop partition(city='tiajin'),partition(city='shanghai');
修改分区的数据目录:
alter table student_ptn partition(city="beijing") set location "/stu_beijing"; XXXX
alter table student_ptn partition(city="beijing") set location "hdfs://hadoop02:9000/stu_beijing"; √√√√√√√
清空表:
truncate table student;
等价于下面:
hadoop fs -rm -r /user/hive/warehouse/myhive.db/student/*
前面的load和hadoop fs -put命令也是等价的。
显示创建表的详细信息
show create table student;
// 准备student表:以及数据:
create database if not exists myhive;
use myhive;
create table student(id int, name string, sex string, age int, department string) row format delimited fields terminated by ",";
load data local inpath "/home/data/student.txt" into table student;
select * from student;
修改表
1、修改表名
alter table student rename to studentss;
2、修改字段
添加字段:
alter table student add columns (city string, dt string);
删除字段:
alter table student drop columns (city); //报错XXXXXXXX
替换字段:
alter table student replace columns (id int, name string, sex string, age int);
改变列的定义:
alter table student change id newid string comment "new id";
改变列的顺序:
alter table student change sex sex string first;
alter table student change name name string after sex;
3、修改分区
添加分区:
alter table student_ptn add partition(city='tiajin') partition(city='shanghai');
删除分区:
alter table student_ptn drop partition(city='tiajin');
alter table student_ptn drop partition(city='tiajin'),partition(city='shanghai');
修改分区的数据目录:
alter table student_ptn partition(city="beijing") set location "/stu_beijing"; XXXX
alter table student_ptn partition(city="beijing") set location "hdfs://hadoop0:8020/stu_beijing"; √√√√√√√
二、DML
1、导入数据
load
load data local inpath “/home/data/student.txt” into table student;
insert
insert into table student (id, name, sex, age, department) values (101,”huangbo”,”M”,222,”IT”);
创建分区表:
create table student_ptn (id int, name string, sex string, age int) partitioned by (department string) row format delimited fields terminated by “,”;
单重插入:
insert into table student_ptn partition (department = ‘IS’) select id,sex,name,age from student where department = ‘IS’;
insert into table student_ptn partition (department = ‘CS’) select id,sex,name,age from student where department = ‘CS’;
insert into table student_ptn partition (department = ‘MA’) select id,sex,name,age from student where department = ‘MA’;
多重插入:
from student
insert into table student_ptn partition (department = ‘IS’) select id,sex,name,age where department = ‘IS’
insert into table student_ptn partition (department = ‘CS’) select id,sex,name,age where department = ‘CS’
insert into table student_ptn partition (department = ‘MA’) select id,sex,name,age where department = ‘MA’
2、导出数据
insert overwrite local directory "/home/data/cs_student" select * from student where department = 'CS';
insert overwrite directory "/home/data/cs_student" select * from student where department = 'CS';
3、查询
select ... from ... join ... on... where group by having limit