Sqoop is an ETL tool; the name is short for "SQL to Hadoop". It imports relational database data into Hadoop. Sqoop can be used to create Hive tables and load data into them, which under the hood means creating directories on HDFS and storing the data there.
1. Basic Sqoop commands
@see url
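The link above has the full reference; as a quick sketch, the commands I reach for most often look like the following (host, credentials, and paths are placeholders, not values taken from the link):

# List databases / tables visible through the JDBC connection
sqoop list-databases --connect jdbc:mysql://192.168.0.75:3306 --username root --password 123456
sqoop list-tables --connect jdbc:mysql://192.168.0.75:3306/test --username root --password 123456
# Plain HDFS import of one table with a single mapper
sqoop import --connect jdbc:mysql://192.168.0.75:3306/test --username root --password 123456 \
  --table t_test --target-dir /user/hive/warehouse/test.db/t_test -m 1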
2. Importing from MySQL into Hive
--hive-database qianyang  # specify the Hive database
@see link
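Putting that flag into a full command, a minimal MySQL-to-Hive import might look like this (connection string, credentials, and the t_test table are placeholders; only the --hive-database value comes from above):

sqoop import --connect jdbc:mysql://192.168.0.75:3306/qianyang --username root --password 123456 \
  --table t_test -m 1 \
  --hive-import --hive-database qianyang --hive-table t_test \
  --fields-terminated-by "\001" --null-string '\\N' --null-non-string '\\N'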
3. CRUD for Sqoop jobs
@see sqoop_crud
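The link covers the details; roughly, the CRUD operations map onto the sqoop job subcommands as follows (myjob and the connection values are placeholders):

# Create: everything after "--" is an ordinary import command
sqoop job --create myjob -- import --connect jdbc:mysql://192.168.0.75:3306/test --username root --password 123456 --table t_test -m 1
# Read: list saved jobs / show the stored parameters of one job (including incremental.last.value)
sqoop job --list
sqoop job --show myjob
# Execute the saved job
sqoop job --exec myjob
# Delete: there is no in-place update, so delete and recreate to change a job
sqoop job --delete myjob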
4. Viewing a Hive table's CREATE statement
View a Hive table's CREATE statement: show create table tablename;  View a Hive table's structure: describe tablename; (short form: desc tablename;)
/usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database \
  --username $username --password $password \
  --hive-database $hdb --table $mysql_table

# sqoop version: Sqoop 1.4.6-cdh5.12.2
CREATE TABLE `test`(
  `id` string,
  ...
  `create_date` string)
COMMENT 'Imported by sqoop on 2018/06/19 13:32:17'
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'='\u0001',
  'line.delim'='\n',
  'serialization.format'='\u0001')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://ketech-server51:8020/user/hive/warehouse/mydb.db/test'
TBLPROPERTIES (
  'transient_lastDdlTime'='1529415143')
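The same statements can also be run non-interactively with hive -e, which is handy inside scripts (mydb and test are the database and table from the DDL above):

hive -e "use mydb; show create table test;"
hive -e "use mydb; desc test;"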
5. Job example
#@see https://www.jianshu.com/p/084d1b1e094c
#!/bin/bash
source /etc/profile
source /root/.bashrc
source /etc/hive/conf/hive-env.sh
source /etc/sqoop/conf/sqoop-env.sh
##############################################
## $1: date   $2: table name
## The first argument is the date, the second is the MySQL table name
##############################################
# Hive database name (defaults to: default)
hdb=test
# Hive table name
hive_table=t_test
# MySQL table name
mysql_table=T_TEST
# MySQL server address
server=192.168.0.75
# MySQL port
port=1521
# database name
mysql_database=orcl
# username
username=root
# password
password=123456
job_name=cdr_record

# Check whether the Hive table exists; create it if it does not, otherwise skip
/usr/bin/hive -e "use $hdb;select * from $hive_table limit 1;"
if [ $? -ne 0 ]
then
    echo "Table does not exist, creating the table structure"
    /usr/bin/sqoop create-hive-table --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --hive-database $hdb --table $mysql_table
else
    echo "Table already exists, running the incremental import..."
fi
#exit
#
# One incremental mode is append, driven by a monotonically increasing column, e.g.:
#   --incremental append --check-column num_iid --last-value 0
# The other mode is based on a timestamp, e.g.:
#   --incremental lastmodified --check-column created --last-value '2012-02-01 11:0:00'
# which imports only rows whose created value is greater than '2012-02-01 11:0:00'.
sqoop job --show $job_name > /dev/null 2>&1
if [ $? -ne 0 ]
then
    echo "Job does not exist, creating it"
    echo "Creating job"
    #append
    /usr/bin/sqoop job --create $job_name -- import --connect jdbc:oracle:thin:@$server:$port:$mysql_database --username $username --password $password --table $mysql_table --fields-terminated-by "\001" --null-string '\\N' --null-non-string '\\N' --target-dir /user/hive/warehouse/test.db/t_test --incremental lastmodified --check-column CREATE_DATE -m 1 -z --append
else
    echo "Job already exists, running the incremental import..."
    echo "Starting append-mode incremental import..."
    /usr/bin/sqoop job --exec $job_name
fi
exit
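For a daily incremental load the script is usually driven by cron; a minimal sketch, assuming it is saved as /root/sqoop_incremental.sh (hypothetical path and schedule):

# Run the incremental import every day at 01:00; % must be escaped inside crontab
0 1 * * * /bin/bash /root/sqoop_incremental.sh $(date +\%Y\%m\%d) T_TEST >> /var/log/sqoop_incremental.log 2>&1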
6. Table-creation source code
/**
 * Sqoop 1.4.3
 */
package org.apache.sqoop.hive;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Map;
import java.util.Date;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Properties;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sqoop.io.CodecMap;

import com.cloudera.sqoop.SqoopOptions;
import com.cloudera.sqoop.manager.ConnManager;

/**
 * Creates (Hive-specific) SQL DDL statements to create tables to hold data
 * we're importing from another source.
 *
 * After we import the database into HDFS, we can inject it into Hive using
 * the CREATE TABLE and LOAD DATA INPATH statements generated by this object.
 */
public class TableDefWriter {

  public static final Log LOG = LogFactory.getLog(TableDefWriter.class.getName());

  private SqoopOptions options;
  private ConnManager connManager;
  private Configuration configuration;
  private String inputTableName;
  private String outputTableName;
  private boolean commentsEnabled;

  /**
   * Creates a new TableDefWriter to generate a Hive CREATE TABLE statement.
   *
   * @param opts program-wide options
   * @param connMgr the connection manager used to describe the table.
   * @param inputTable the name of the table to load.
   * @param outputTable the name of the Hive table to create.
   * @param config the Hadoop configuration to use to connect to the dfs
   * @param withComments if true, then tables will be created with a timestamp comment.
   */
  public TableDefWriter(final SqoopOptions opts, final ConnManager connMgr,
      final String inputTable, final String outputTable,
      final Configuration config, final boolean withComments) {
    this.options = opts;
    this.connManager = connMgr;
    this.inputTableName = inputTable;
    this.outputTableName = outputTable;
    this.configuration = config;
    this.commentsEnabled = withComments;
  }

  private Map<String, Integer> externalColTypes;

  /**
   * Set the column type map to be used. (dependency injection for testing;
   * not used in production.)
   */
  public void setColumnTypes(Map<String, Integer> colTypes) {
    this.externalColTypes = colTypes;
    LOG.debug("Using test-controlled type map");
  }

  /**
   * Get the column names to import.
   */
  private String[] getColumnNames() {
    String[] colNames = options.getColumns();
    if (null != colNames) {
      return colNames; // user-specified column names.
    } else if (null != externalColTypes) {
      // Test-injection column mapping. Extract the col names from this.
      ArrayList<String> keyList = new ArrayList<String>();
      for (String key : externalColTypes.keySet()) {
        keyList.add(key);
      }
      return keyList.toArray(new String[keyList.size()]);
    } else if (null != inputTableName) {
      return connManager.getColumnNames(inputTableName);
    } else {
      return connManager.getColumnNamesForQuery(options.getSqlQuery());
    }
  }

  /**
   * @return the CREATE TABLE statement for the table to load into hive.
   */
  public String getCreateTableStmt() throws IOException {
    Map<String, Integer> columnTypes;
    Properties userMapping = options.getMapColumnHive();

    if (externalColTypes != null) {
      // Use pre-defined column types.
      columnTypes = externalColTypes;
    } else {
      // Get these from the database.
      if (null != inputTableName) {
        columnTypes = connManager.getColumnTypes(inputTableName);
      } else {
        columnTypes = connManager.getColumnTypesForQuery(options.getSqlQuery());
      }
    }

    String[] colNames = getColumnNames();
    StringBuilder sb = new StringBuilder();
    if (options.doFailIfHiveTableExists()) {
      sb.append("CREATE TABLE `").append(outputTableName).append("` ( ");
    } else {
      sb.append("CREATE TABLE IF NOT EXISTS `");
      sb.append(outputTableName).append("` ( ");
    }

    // Check that all explicitly mapped columns are present in result set
    for (Object column : userMapping.keySet()) {
      boolean found = false;
      for (String c : colNames) {
        if (c.equals(column)) {
          found = true;
          break;
        }
      }
      if (!found) {
        throw new IllegalArgumentException("No column by the name " + column
            + "found while importing data");
      }
    }

    boolean first = true;
    String partitionKey = options.getHivePartitionKey();
    for (String col : colNames) {
      if (col.equals(partitionKey)) {
        throw new IllegalArgumentException("Partition key " + col + " cannot "
            + "be a column to import.");
      }

      if (!first) {
        sb.append(", ");
      }
      first = false;

      Integer colType = columnTypes.get(col);
      String hiveColType = userMapping.getProperty(col);
      if (hiveColType == null) {
        hiveColType = connManager.toHiveType(inputTableName, col, colType);
      }
      if (null == hiveColType) {
        throw new IOException("Hive does not support the SQL type for column " + col);
      }

      sb.append('`').append(col).append("` ").append(hiveColType);

      if (HiveTypes.isHiveTypeImprovised(colType)) {
        LOG.warn("Column " + col + " had to be cast to a less precise type in Hive");
      }
    }

    sb.append(") ");

    if (commentsEnabled) {
      DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
      String curDateStr = dateFormat.format(new Date());
      sb.append("COMMENT 'Imported by sqoop on " + curDateStr + "' ");
    }

    if (partitionKey != null) {
      sb.append("PARTITIONED BY (").append(partitionKey).append(" STRING) ");
    }

    sb.append("ROW FORMAT DELIMITED FIELDS TERMINATED BY '");
    sb.append(getHiveOctalCharCode((int) options.getOutputFieldDelim()));
    sb.append("' LINES TERMINATED BY '");
    sb.append(getHiveOctalCharCode((int) options.getOutputRecordDelim()));
    String codec = options.getCompressionCodec();
    if (codec != null && (codec.equals(CodecMap.LZOP)
        || codec.equals(CodecMap.getCodecClassName(CodecMap.LZOP)))) {
      sb.append("' STORED AS INPUTFORMAT "
          + "'com.hadoop.mapred.DeprecatedLzoTextInputFormat'");
      sb.append(" OUTPUTFORMAT "
          + "'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'");
    } else {
      sb.append("' STORED AS TEXTFILE");
    }

    LOG.debug("Create statement: " + sb.toString());
    return sb.toString();
  }

  /**
   * @return the LOAD DATA statement to import the data in HDFS into hive.
   */
  public String getLoadDataStmt() throws IOException {
    Path finalPath = getFinalPath();

    StringBuilder sb = new StringBuilder();
    sb.append("LOAD DATA INPATH '");
    sb.append(finalPath.toString() + "'");
    if (options.doOverwriteHiveTable()) {
      sb.append(" OVERWRITE");
    }
    sb.append(" INTO TABLE `");
    sb.append(outputTableName);
    sb.append('`');

    if (options.getHivePartitionKey() != null) {
      sb.append(" PARTITION (").append(options.getHivePartitionKey()).append("='")
          .append(options.getHivePartitionValue()).append("')");
    }

    LOG.debug("Load statement: " + sb.toString());
    return sb.toString();
  }

  public Path getFinalPath() throws IOException {
    String warehouseDir = options.getWarehouseDir();
    if (null == warehouseDir) {
      warehouseDir = "";
    } else if (!warehouseDir.endsWith(File.separator)) {
      warehouseDir = warehouseDir + File.separator;
    }

    // Final path is determined in the following order:
    // 1. Use target dir if the user specified.
    // 2. Use input table name.
    String tablePath = null;
    String targetDir = options.getTargetDir();
    if (null != targetDir) {
      tablePath = warehouseDir + targetDir;
    } else {
      tablePath = warehouseDir + inputTableName;
    }
    FileSystem fs = FileSystem.get(configuration);
    return new Path(tablePath).makeQualified(fs);
  }

  /**
   * Return a string identifying the character to use as a delimiter in Hive,
   * in octal representation. Hive can specify delimiter characters in the
   * form '\ooo' where ooo is a three-digit octal number between 000 and 177.
   * Values may not be truncated ('\12' is wrong; '\012' is ok) nor may they
   * be zero-prefixed (e.g., '\0177' is wrong).
   *
   * @param charNum the character to use as a delimiter
   * @return a string of the form "\ooo" where ooo is an octal number in [000, 177].
   * @throws IllegalArgumentException if charNum > 0177.
   */
  public static String getHiveOctalCharCode(int charNum) {
    if (charNum > 0177) {
      throw new IllegalArgumentException("Character " + charNum
          + " is an out-of-range delimiter");
    }
    return String.format("\\%03o", charNum);
  }
}
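The userMapping lookup in getCreateTableStmt() is fed by Sqoop's --map-column-hive option, so the generated DDL can be steered from the command line. A minimal sketch reusing the values from the job script above (the CREATE_DATE=string override itself is just an example, not something the source requires):

sqoop import --connect jdbc:oracle:thin:@192.168.0.75:1521:orcl --username root --password 123456 \
  --table T_TEST -m 1 \
  --hive-import --hive-database test --hive-table t_test \
  --map-column-hive CREATE_DATE=string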