# Sqoop提供了增量import數據的方法,可以只從RDBMS中獲取上次import操作後的新增數據。
Argument Description
--check-column (col) Specifies the column to be examined when determining which rows to import. (the column should not be of type CHAR/NCHAR/VARCHAR/VARNCHAR/ LONGVARCHAR/LONGNVARCHAR)
--incremental (mode) Specifies how Sqoop determines which rows are new. Legal values for mode include append and lastmodified.
--last-value (value) Specifies the maximum value of the check column from the previous import.
# 注意
--append and --delete-target-dir cannot be used together.
# Sqoop支持兩種增量import的方式:append和lastmodified,可以通過 --incremental參數指定以哪種方式import
-- Demo table for Sqoop incremental imports in append mode.
-- `id` is AUTO_INCREMENT and monotonically increasing, which makes it a valid
-- --check-column: each Sqoop run imports only rows whose id exceeds the
-- --last-value reported by the previous run.
CREATE TABLE test.`sqoop_append` (
`id` int(10) NOT NULL AUTO_INCREMENT,
`operator` int(10) NOT NULL,
`op_time` datetime NOT NULL,
PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
# 使用python向數據表每隔一秒寫入一條記錄:
import MySQLdb
import time

# Open one shared connection to the `test` database.
# NOTE(review): credentials are tutorial placeholders — replace before use.
db = MySQLdb.connect("mysql_host", "root", "mysql-passwod", "test", charset="utf8")
cursor = db.cursor()


def insert_per_second():
    """Insert one row into test.sqoop_append every second, forever.

    Each insert is committed immediately so the new row is visible to the
    next incremental Sqoop import. (The original code placed db.commit()
    after the sleep at the tail of the infinite loop, so rows were committed
    one iteration late at best — and never if the loop body is interrupted.)
    """
    # The statement is loop-invariant; build it once.
    insert_sql = "insert into test.sqoop_append(operator, op_time) values (1, now())"
    while True:
        print(insert_sql)
        cursor.execute(insert_sql)
        db.commit()  # commit before sleeping so the row is durable right away
        time.sleep(1)


insert_per_second()
首次執行導入程序:
# Initial incremental import: --last-value 0 means "import every row with
# id > 0", i.e. the whole table on the first run.
# The literal $CONDITIONS token in the --query WHERE clause is mandatory;
# Sqoop replaces it with each mapper's split predicate (split on `id` across
# --num-mappers 2 mappers). --target-dir is required when using --query.
sqoop-import \
--connect jdbc:mysql://mysql_host:3306/test \
--username root \
--password-file /test/my_passwd \
--query 'SELECT * from test.sqoop_append WHERE $CONDITIONS' \
--num-mappers 2 \
--split-by id \
--incremental append \
--check-column id \
--last-value 0 \
--target-dir /user/hadoop/sqoop_append \
--as-parquetfile
執行輸出爲:
2019-07-16 17:09:22,634 INFO tool.ImportTool: Incremental import complete! To run another incremental import of all data following this import, supply the following arguments:
2019-07-16 17:09:22,634 INFO tool.ImportTool: --incremental append
2019-07-16 17:09:22,634 INFO tool.ImportTool: --check-column id
2019-07-16 17:09:22,634 INFO tool.ImportTool: --last-value 188
2019-07-16 17:09:22,634 INFO tool.ImportTool: (Consider saving this with 'sqoop job --create')
第二次導入數據:
# Second incremental import: --last-value 188 is taken from the previous
# run's "tool.ImportTool: --last-value 188" log line, so only rows with
# id > 188 are imported and appended to the same --target-dir.
# (Consider `sqoop job --create` so Sqoop tracks --last-value automatically.)
sqoop-import \
--connect jdbc:mysql://mysql_host:3306/test \
--username root \
--password-file /test/my_passwd \
--query 'SELECT * from test.sqoop_append WHERE $CONDITIONS' \
--num-mappers 2 \
--split-by id \
--incremental append \
--check-column id \
--last-value 188 \
--target-dir /user/hadoop/sqoop_append \
--as-parquetfile
打印輸出爲:
2019-07-16 17:12:12,502 INFO tool.ImportTool: --incremental append
2019-07-16 17:12:12,502 INFO tool.ImportTool: --check-column id
2019-07-16 17:12:12,502 INFO tool.ImportTool: --last-value 325
2019-07-16 17:12:12,502 INFO tool.ImportTool: (Consider saving this with 'sqoop job --create')