Introduction to Installing and Using Airflow

1. Installing and Starting Airflow

python3 -m venv /pyenv/airflow    # create a dedicated Python virtual environment for Airflow
. /pyenv/airflow/bin/activate    # activate the virtual environment

export AIRFLOW_HOME=~/airflow     # set Airflow's home directory
pip install apache-airflow    # install apache-airflow
airflow initdb    # initialize the metadata database; Airflow uses SQLite by default
airflow webserver -p 8080    # start the webserver, then visit http://ip:8080
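
After starting the webserver you may want a quick way to confirm it is actually answering requests. The sketch below is only an illustration using the standard library; the host, port, and the /health endpoint are assumptions based on the setup above and on recent Airflow releases, so adjust them to your environment.

# check_webserver.py (hypothetical helper): quick sanity check that the webserver is up.
# Assumes the webserver listens on localhost:8080 and exposes the /health endpoint.
from urllib.request import urlopen
from urllib.error import URLError

try:
    with urlopen("http://localhost:8080/health", timeout=5) as resp:
        print(resp.read().decode("utf-8"))
except URLError as exc:
    print("Webserver not reachable: %s" % exc)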

2. Commonly Used Settings in the Airflow Configuration File ~/airflow/airflow.cfg

# web server
[webserver]
# IP address the webserver binds to
web_server_host = 0.0.0.0
# port the webserver listens on
web_server_port = 8080

# authentication
authenticate = True        # defaults to False, i.e. no authentication
auth_backend = airflow.contrib.auth.backends.password_auth    # password authentication
# auth_backend = airflow.contrib.auth.backends.ldap_auth    # LDAP authentication; requires the corresponding [ldap] section

# required when using LDAP authentication
[ldap]
# set this to ldaps://<your.ldap.server>:<port>
uri = ldap://ip:389
user_filter = objectClass=*
user_name_attr = sAMAccountName        # the attribute used as the web UI login account
group_member_attr = memberOf
superuser_filter =        # may be left empty
data_profiler_filter =    # may be left empty
bind_user = cn=Manager,dc=example,dc=com
bind_password = your_password     # your AD password
basedn = DC=dpbg,DC=lh,DC=com
cacert = /etc/ca/ldap_ca.crt
search_scope = SUBTREE    # must be SUBTREE when logging in with an AD domain account

# email service
[smtp]
smtp_host =        # SMTP host IP
smtp_starttls = True
smtp_ssl = False
smtp_port =     # SMTP port
smtp_mail_from = [email protected]    # the From address
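
These settings are resolved by Airflow through airflow.configuration at runtime, so a quick way to confirm what a running installation actually sees is to read them back with the same module. A minimal sketch, using section and key names from the snippet above:

from airflow.configuration import conf

# Read back a few of the settings shown above, exactly as Airflow resolves them
print(conf.get("webserver", "web_server_host"))
print(conf.get("webserver", "web_server_port"))
print(conf.get("smtp", "smtp_mail_from"))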

3. With password authentication, users must be added manually. I wrote a script for adding users; run it with python insert_user.py to add an Airflow user.

import getpass
import airflow
from airflow import models, settings
from airflow.contrib.auth.backends.password_auth import PasswordUser


def insert_user(username, email, password, is_superuser):
    user = PasswordUser(models.User())
    user.username = username    # set the account name
    user.email = email      # set the email address
    user.password = password    # set the password
    user.superuser = is_superuser   # whether the user is a superuser
    session = settings.Session()
    session.add(user)       # add the user
    session.commit()    # commit
    session.close()


if __name__ == '__main__':
    username = input("Enter your username:")
    email = input("Enter your email:")
    is_superuser = input("Is super user(Y/N)?")
    is_superuser = 1 if is_superuser.upper() == 'Y' else 0
    password1 = getpass.getpass("Enter your password:")
    password2 = getpass.getpass("Please re-enter your password:")
    if password1 != password2:  # check that the two passwords match
        print("Passwords entered twice are inconsistent!")
    else:   # add the account
        insert_user(username=username, email=email, password=password1, is_superuser=is_superuser)
        print("Success!")
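
To double-check that the script really wrote the account into the metadata database, the same User model can be queried back through a session. A minimal sketch under the same Airflow 1.x setup as above:

from airflow import models, settings

# List the users currently stored in Airflow's metadata database
session = settings.Session()
for user in session.query(models.User).all():
    print(user.username, user.email, user.superuser)
session.close()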


4. With LDAP authentication, logging in to the web UI fails with an error saying the ldap_ca.crt file does not exist. One workaround is to edit the source: vim /pyenv/airflow/lib/python3.6/site-packages/airflow/contrib/auth/backends/ldap_auth.py

# comment out the original code
# tls_configuration = Tls(validate=ssl.CERT_REQUIRED,
#                         ca_certs_file=cacert)

# server = Server(conf.get("ldap", "uri"),
#                 use_ssl=True,
#                 tls=tls_configuration)


# modified code
server = Server(conf.get("ldap", "uri"),
                use_ssl=False,
                tls=None)
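
Before (or instead of) patching the backend, it can be useful to verify the LDAP bind on its own with the ldap3 library that ldap_auth.py relies on. A minimal diagnostic sketch, reusing the placeholder uri, bind_user, and bind_password from the [ldap] section above:

from ldap3 import Server, Connection, ALL

# Attempt a simple bind with the same credentials configured in airflow.cfg
server = Server("ldap://ip:389", get_info=ALL)
conn = Connection(server, user="cn=Manager,dc=example,dc=com", password="your_password")
print("bind ok:", conn.bind())    # True if the URI and credentials are correct
print(conn.result)                # server response, with error details if the bind failed
conn.unbind()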

5. Starting and stopping Airflow is a bit tedious, so I wrote my own scripts, which you are welcome to use as a reference. You can also run Airflow under systemd (try that yourself if you are interested).

# startup script start_airflow.sh
#!/bin/bash
airflow webserver -D && airflow scheduler -D
echo "Start success"


# shutdown script stop_airflow.sh
#!/bin/bash
ps -ef|egrep 'scheduler|airflow-webserver'|grep -v grep|awk '{print $2}'|xargs kill -9
rm -rf ~/airflow/*.pid
echo "Stop success"
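
If you want to check whether the daemons are still alive before running either script, the PID files that -D mode writes under AIRFLOW_HOME can be inspected from Python as well. A minimal sketch, assuming the default ~/airflow location used above:

import glob
import os

# Inspect the PID files written by `airflow webserver -D` and `airflow scheduler -D`
for pid_file in glob.glob(os.path.expanduser("~/airflow/*.pid")):
    pid = int(open(pid_file).read().strip())
    try:
        os.kill(pid, 0)     # signal 0 only tests whether the process exists
        print("%s: pid %d is running" % (pid_file, pid))
    except OSError:
        print("%s: pid %d looks stale" % (pid_file, pid))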

6. Using Airflow, taking remote execution of a Python script as an example. Configure an SSH connection in the web UI under Admin --> Connections (see the figure below). Put the DAG script below into ~/airflow/dags and restart Airflow.

from datetime import datetime
from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.ssh_operator import SSHOperator
from airflow.operators.email_operator import EmailOperator

from airflow.utils.trigger_rule import TriggerRule


# These args will get passed on to each operator
# You can override them on a per-task basis during operator initialization
default_args = {
    'owner': 'Airflow', 
    'depends_on_past': False,
    'start_date': airflow.utils.dates.days_ago(0),  # DAG start date
    'email': ['[email protected]'],
    'email_on_failure': True,
    'email_on_retry': True,
    # 'retries': 1,
    # 'retry_delay': timedelta(seconds=30),
    # 'queue': 'bash_queue',
    # 'pool': 'backfill',
    # 'priority_weight': 10,
    #'end_date': datetime(2019, 12, 4, 17, 30),
    # 'wait_for_downstream': True,
    # 'dag': dag,
    # 'sla': timedelta(hours=2),
    # 'execution_timeout': timedelta(seconds=300),
    # 'on_failure_callback': some_function,
    # 'on_success_callback': some_other_function,
    # 'on_retry_callback': another_function,
    # 'trigger_rule': u'all_success'
}


dag = DAG(
    'dag_name',
    default_args=default_args,
    description='DAG description',
    schedule_interval='0 0 * * *',   # when the DAG runs (cron expression)
    max_active_runs=2,    # at most 2 DAG runs may be active at once
)



task = SSHOperator(
    ssh_conn_id='ssh_id',  # SSH connection ID configured under Admin --> Connections
    task_id='task_id',
    command='python python_script.py >> /var/log/dag_name.log',    # command to run on the remote host
    dag=dag
)


task_failed = EmailOperator(
    dag=dag,
    trigger_rule=TriggerRule.ONE_FAILED,    # fires when an upstream task fails
    task_id="task_failed_id",
    to=["[email protected]"],     # notified by email on failure
    subject="Email subject",
    html_content='Email body'
)
 

task_failed.set_upstream([task])    # dependency between tasks
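
The same dependency can also be written with Airflow's bitshift syntax, which reads left to right and stays compact as more tasks are added; the line below is equivalent to the set_upstream call above:

# Equivalent dependency declaration using the bitshift operators
task >> task_failed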