Airflow安裝和使用介紹
1.Airflow安裝和啓動
# Create a Python virtual environment for Airflow
python3 -m venv /pyenv/airflow
# Activate the virtual environment
source /pyenv/airflow/bin/activate
# Set the Airflow home (installation) directory
export AIRFLOW_HOME=~/airflow
# Install apache-airflow
pip install apache-airflow
# Initialise the metadata database (SQLite by default)
airflow initdb
# Start the web server; the UI is then reachable at http://ip:8080
airflow webserver --port 8080
2.Airflow配置文件~/airflow/airflow.cfg的常用介紹
# Web server settings.
# Comments are kept on their own lines: Python's configparser does not strip
# inline comments by default, so text after a value may become part of it.
[webserver]
# IP address the web server binds to on start-up
web_server_host = 0.0.0.0
# Port the web server listens on
web_server_port = 8080
# Authentication switch; the default is False (no login required)
authenticate = True
# Password authentication backend
auth_backend = airflow.contrib.auth.backends.password_auth
# LDAP authentication backend; requires the matching [ldap] section
# auth_backend = airflow.contrib.auth.backends.ldap_auth
# This section must be configured when the LDAP authentication backend is used.
# Comments are kept on their own lines: Python's configparser does not strip
# inline comments by default, so text after a value may become part of it.
[ldap]
# set this to ldaps://<your.ldap.server>:<port>
uri = ldap://ip:389
user_filter = objectClass=*
# Attribute that corresponds to the web UI login account
user_name_attr = sAMAccountName
group_member_attr = memberOf
# May be left empty
superuser_filter =
# May be left empty
data_profiler_filter =
bind_user = cn=Manager,dc=example,dc=com
# Your AD password
bind_password = your_password
basedn = DC=dpbg,DC=lh,DC=com
cacert = /etc/ca/ldap_ca.crt
# Must be SUBTREE when logging in against an AD domain
search_scope = SUBTREE
# Mail service.
# Comments are kept on their own lines: Python's configparser does not strip
# inline comments by default, so text after a value may become part of it.
[smtp]
# SMTP host IP (fill in)
smtp_host =
smtp_starttls = True
smtp_ssl = False
# SMTP port (fill in)
smtp_port =
# Sender ("from") mail address
smtp_mail_from = [email protected]
3.密碼認證方式需手動添加用戶。我寫了添加用戶的腳本,可通過執行命令python insert_user.py添加Airflow用戶
import getpass
import airflow
from airflow import models, settings
from airflow.contrib.auth.backends.password_auth import PasswordUser
def inser_user(username, email, password, is_superuser):
    """Create a password-authenticated user and persist it.

    Builds a ``PasswordUser`` wrapping a new ``models.User``, fills in the
    given fields and commits it through a session from ``settings.Session``.

    Args:
        username: Login name stored on the user.
        email: E-mail address stored on the user.
        password: Value assigned to the user's ``password`` attribute.
        is_superuser: Value assigned to the user's ``superuser`` attribute.

    Raises:
        Exception: Whatever ``session.add``/``session.commit`` raises is
            re-raised after the transaction has been rolled back.
    """
    user = PasswordUser(models.User())
    user.username = username
    user.email = email
    user.password = password
    user.superuser = is_superuser

    session = settings.Session()
    try:
        session.add(user)
        session.commit()
    except Exception:
        # Undo the failed transaction before propagating the error.
        session.rollback()
        raise
    finally:
        # Always release the session, even when the commit fails.
        session.close()
if __name__ == '__main__':
    # Strip surrounding whitespace so stray spaces neither end up in the
    # stored account nor change the meaning of the Y/N answer.
    username = input("Enter your username:").strip()
    email = input("Enter your email:").strip()
    answer = input("Is super user(Y/N)?").strip()
    is_superuser = 1 if answer.upper() == 'Y' else 0
    password1 = getpass.getpass("Enter your password:")
    password2 = getpass.getpass("Please re-enter your password:")

    # Refuse to create an account that could not be used to log in.
    if not username or not password1:
        raise SystemExit("Username and password must not be empty!")
    # Both password entries must match; exit with a non-zero status otherwise.
    if password1 != password2:
        raise SystemExit("Passwords entered twice are inconsistent!")

    inser_user(username=username, email=email, password=password1, is_superuser=is_superuser)
    print("Success!")
4.ldap認證方式,登錄WEB UI時報錯ldap_ca.crt文件不存在,解決方法修改源碼vim /pyenv/airflow/lib/python3.6/site-packages/airflow/contrib/auth/backends/ldap_auth.py
# Original code, commented out
# tls_configuration = Tls(validate=ssl.CERT_REQUIRED,
# ca_certs_file=cacert)
# server = Server(conf.get("ldap", "uri"),
# use_ssl=True,
# tls=tls_configuration)
# Modified code: build the server from the configured [ldap] uri with
# use_ssl=False and no Tls configuration, so the CA certificate file is not read.
# NOTE(review): this presumably leaves the LDAP connection unencrypted and the
# server certificate unverified - confirm this is acceptable, or provide the
# cacert file and keep the original code instead.
server = Server(conf.get("ldap", "uri"),
use_ssl=False,
tls=None)
5.Airflow啓動和關閉比較麻煩,我自己寫了腳本,大家可以參考下。也可使用systemd運行Airflow(感興趣的可自己嘗試)
# Start script: start_airflow.sh
#!/bin/bash
# Start the web server and then the scheduler, both with -D.
# Report success only when both commands exit successfully.
if airflow webserver -D && airflow scheduler -D; then
    echo "Start success"
else
    echo "Start failed" >&2
    exit 1
fi
# Stop script: stop_airflow.sh
#!/bin/bash
# Kill every process whose command line matches the pattern below.
# NOTE(review): 'scheduler' matches any process containing that word, not only
# Airflow's scheduler - confirm nothing else on the host matches.
# xargs -r skips running kill when no PID was found.
ps -ef | grep -E 'scheduler|airflow-webserver' | grep -v grep | awk '{print $2}' | xargs -r kill -9
# Remove the PID files from the Airflow home directory
# (falls back to ~/airflow when AIRFLOW_HOME is not set).
rm -f "${AIRFLOW_HOME:-$HOME/airflow}"/*.pid
echo "Stop success"
6.Airflow使用,本文以遠程執行python腳本爲例。通過WEB UI中Admin–>Connections設置ssh連接方式(見下圖)。在~/airflow/dags目錄下配置python腳本,並重啓Airflow。
from datetime import datetime
from datetime import timedelta
import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.email_operator import EmailOperator
from airflow.utils.trigger_rule import TriggerRule
# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'Airflow',
    'depends_on_past': False,
    # First date the DAG is scheduled from. A fixed date is used because
    # Airflow's FAQ recommends against dynamic start dates such as
    # days_ago(0); adjust it to your own deployment date.
    'start_date': datetime(2019, 12, 1),
    'email': ['[email protected]'],
    'email_on_failure': True,
    'email_on_retry': True,
    # Optional per-task settings, uncomment as needed:
    # 'retries': 1,
    # 'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    # 'end_date': datetime(2019, 12, 4, 17, 30),
    # 'wait_for_downstream': True,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}

dag = DAG(
    'dag_name',
    default_args=default_args,
    description='dag描述信息',
    schedule_interval='0 0 * * *',  # cron expression for the run time
    # max_active_runs is an argument of DAG, not of the operators, so it is
    # passed here instead of through default_args.
    max_active_runs=2,
    # Do not backfill the intervals between the fixed start_date and today.
    catchup=False,
)
# Task that runs the Python script over the configured SSH connection
task = SSHOperator(
    task_id='task_id',
    dag=dag,
    ssh_conn_id='ssh_id',  # ID of the SSH connection set under Admin-->Connections
    command='python python_script.py >> /var/log/dag_name.log',  # command to execute
)
# Notification task: sends a mail when the upstream task has failed
task_failed = EmailOperator(
    task_id="task_failed_id",
    dag=dag,
    trigger_rule=TriggerRule.ONE_FAILED,  # triggered when an upstream task fails
    to=["[email protected]"],  # recipients of the failure notification
    subject="郵件主旨",
    html_content='郵件內容',
)

# Dependency between the tasks: task_failed is downstream of task
task_failed.set_upstream(task)