寫在前面:
大家學習知識不用死抓怎麼實現,很多同學認爲學了套路能做到舉一反三就不錯了,這其實還是停留在“術”的層面。大家要學會了解底層的原理自己去折騰,所以這也是爲什麼我們要花將近一年左右的時間,去學 NDK 去學 Linux 內核,因爲很多東西網上也是搜索不到的。
監控死鎖:
主線程死鎖容易引起 ANR,其他線程死鎖容易引起異常(不一定閃退,但可能導致用戶殺死或卸載 App)。開發需求的時候我們其實很少會自己寫出死鎖(SDK 開發除外),很多情況下都是不小心調用了第三方或者系統的一些 API 導致的。那我們有沒有辦法把線上死鎖引起的 ANR 上報到服務器呢?或者說有沒有什麼方法可以及時地監控到死鎖?先來看一個死鎖的例子:
// Deadlock demo: thread1 locks deadLock1 and then deadLock2, while thread2 (below) takes the
// same two locks in the opposite order.
Thread thread1 = new Thread(new Runnable() {
@Override
public void run() {
// First lock taken by this thread.
synchronized (deadLock1) {
try {
// Pause while holding deadLock1 — presumably so the other thread can grab its first lock
// (sleep_ is defined elsewhere; its unit is not visible here).
sleep_(1);
} catch (Exception e) {
e.printStackTrace();
}
// Second lock: held by testThread2 if that thread has already entered its outer block.
synchronized (deadLock2) {
Log.e("TAG","thread1");
}
}
}
}, "testThread1");
// Mirror image of thread1: deadLock2 first, then deadLock1.
Thread thread2 = new Thread(new Runnable() {
@Override
public void run() {
// First lock taken by this thread.
synchronized (deadLock2) {
try {
// Pause while holding deadLock2 (see the note on sleep_ above).
sleep_(1);
} catch (Exception e) {
e.printStackTrace();
}
// Second lock: held by testThread1 if that thread has already entered its outer block.
synchronized (deadLock1) {
Log.e("TAG","thread2");
}
}
}
}, "testThread2");
這是一個比較典型的死鎖例子,很多同學肉眼一般能看出來,但是到了線上我們就得做個自動分析,首先如果在本地排查,我們最好的方法是先 dump 到線程的信息
"testThread1@5890" prio=5 tid=0x5210 nid=NA waiting for monitor entry
java.lang.Thread.State: BLOCKED
waiting for testThread2@5889 to release lock on <0x1709> (a java.lang.Object)
at com.darren.optimize.day13.MainActivity$3.run(MainActivity.java:195)
- locked <0x1708> (a java.lang.Object)
at java.lang.Thread.run(Thread.java:784)
"testThread2@5889" prio=5 tid=0x5211 nid=NA waiting for monitor entry
java.lang.Thread.State: BLOCKED
waiting for testThread1@5890 to release lock on <0x1708> (a java.lang.Object)
at com.darren.optimize.day13.MainActivity$4.run(MainActivity.java:212)
- locked <0x1709> (a java.lang.Object)
at java.lang.Thread.run(Thread.java:784)
如果我們能拿到線程在等待哪個鎖釋放、當前持有哪個鎖這兩個信息的話,那麼一切就能迎刃而解了。上期說過,這在 Java 層是無法做到的,但是我們分析了線程創建的底層原理後,在 Native 層找到了答案:
http://androidxref.com/9.0.0_r3/xref/art/runtime/monitor.cc
// Which lock (monitor object) the given thread is currently contending for.
// Returns thread->GetMonitorEnterObject() when it is non-null; otherwise the object of
// thread->GetWaitMonitor() (read while holding the thread's wait mutex); otherwise nullptr.
mirror::Object* Monitor::GetContendedMonitor(Thread* thread) {
// This is used to implement JDWP's ThreadReference.CurrentContendedMonitor, and has a bizarre
// definition of contended that includes a monitor a thread is trying to enter...
mirror::Object* result = thread->GetMonitorEnterObject();
if (result == nullptr) {
// ...but also a monitor that the thread is waiting on.
MutexLock mu(Thread::Current(), *thread->GetWaitMutex());
Monitor* monitor = thread->GetWaitMonitor();
if (monitor != nullptr) {
result = monitor->GetObject();
}
}
return result;
}
// Which thread currently holds the lock of the given object.
// Dispatches on the state of the object's lock word: hash-code and unlocked states yield
// ThreadList::kInvalidThreadId, a thin lock yields the owner stored in the lock word, and a
// fat lock yields the owner thread id of its Monitor.
uint32_t Monitor::GetLockOwnerThreadId(mirror::Object* obj) {
DCHECK(obj != nullptr);
LockWord lock_word = obj->GetLockWord(true);
switch (lock_word.GetState()) {
case LockWord::kHashCode:
// Fall-through.
case LockWord::kUnlocked:
return ThreadList::kInvalidThreadId;
case LockWord::kThinLocked:
return lock_word.ThinLockOwner();
case LockWord::kFatLocked: {
Monitor* mon = lock_word.FatLockMonitor();
return mon->GetOwnerThreadId();
}
default: {
// Any other lock-word state is treated as a fatal error.
LOG(FATAL) << "Unreachable";
UNREACHABLE();
}
}
}
有了這兩個方法,代碼實現起來就比較簡單了:
- 獲取所有的線程,判斷是不是 BLOCKED 狀態
- 調用 GetContendedMonitor 與 GetLockOwnerThreadId 獲取到被鎖住的線程
- 對死鎖進行分組,輸出死鎖對應的位置
// Initialization: stores the caller-supplied API level and resolves the two ART Monitor
// entry points from libart.so into get_contended_monitor / get_lock_owner_thread.
//
// Returns 0 on success, 1 when libart.so cannot be opened, 2 when
// Monitor::GetContendedMonitor cannot be resolved, 3 when Monitor::GetLockOwnerThreadId
// cannot be resolved.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_nativeInit(JNIEnv *env, jclass clazz, jint level) {
    api_level = level;

    // Open libart.so.
    void *art_handle = ndk_dlopen("libart.so", RTLD_LAZY);
    if (!art_handle) {
        return 1;
    }

    // Monitor::GetContendedMonitor (mangled name).
    get_contended_monitor = ndk_dlsym(art_handle, "_ZN3art7Monitor19GetContendedMonitorEPNS_6ThreadE");
    if (!get_contended_monitor) {
        return 2;
    }

    // Monitor::GetLockOwnerThreadId; its symbol name is chosen per API level by a helper.
    get_lock_owner_thread = ndk_dlsym(art_handle, get_lock_owner_symbol_name(api_level));
    return get_lock_owner_thread ? 0 : 3;
}
// Finds out which thread holds the lock that the given thread is contending for.
//
// native_thread: address of the native thread object (the Java side passes Thread.nativePeer).
// Returns the id reported by Monitor::GetLockOwnerThreadId for the contended monitor, or 0
// when the ART symbols were not resolved, native_thread is 0, or the thread is not
// contending for any monitor.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_getContentThreadIdArt(JNIEnv *env, jclass clazz,
                                                                          jlong native_thread) {
    // Pointer-sized signatures for Monitor::GetContendedMonitor(Thread*) and
    // Monitor::GetLockOwnerThreadId(mirror::Object*). Passing the object pointer through
    // `int` would truncate or sign-extend it on 64-bit ABIs.
    using GetContendedMonitorFn = void *(*)(void *);
    using GetLockOwnerThreadIdFn = uint32_t (*)(void *);

    if (get_contended_monitor == nullptr || get_lock_owner_thread == nullptr) {
        return 0;
    }
    if (native_thread == 0) {
        // A null Thread* would be dereferenced inside GetContendedMonitor.
        return 0;
    }

    void *thread = reinterpret_cast<void *>(static_cast<uintptr_t>(native_thread));
    void *monitor_obj = reinterpret_cast<GetContendedMonitorFn>(get_contended_monitor)(thread);
    if (monitor_obj == nullptr) {
        LOGD("GetContendedMonitor return null");
        return 0;
    }

    const uint32_t owner_id =
            reinterpret_cast<GetLockOwnerThreadIdFn>(get_lock_owner_thread)(monitor_obj);
    return static_cast<jint>(owner_id);
}
// Reads the thread id (the one used by monitors) out of a native thread object.
//
// nativeThread: address of the native thread object (the Java side passes Thread.nativePeer).
// Returns the 32-bit word at index 3 of that object when api_level > 20; returns 0 when
// nativeThread is 0 or api_level <= 20.
extern "C"
JNIEXPORT jint JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_getThreadIdFromThreadPtr(JNIEnv *env, jclass clazz,
                                                                             jlong nativeThread) {
    // Index (in 32-bit words) of the id inside the thread object.
    // NOTE(review): presumably art::Thread's tls32_.thin_lock_thread_id as laid out in the
    // Android 9 sources; the layout is not guaranteed for other API levels — TODO confirm.
    constexpr int kThreadIdWordIndex = 3;

    if (nativeThread == 0) {
        LOGE("getThreadIdFromThreadPtr failed: nativeThread is null");
        return 0;
    }
    if (api_level <= 20) {
        // Only systems newer than Android 5.0 are handled.
        return 0;
    }

    const int *words = reinterpret_cast<const int *>(nativeThread);
    return words[kThreadIdWordIndex];
}
// Resolve the native ART entry points for this API level.
NativeThreadMonitor.nativeInit(Build.VERSION.SDK_INT);
// Record, for every BLOCKED thread, its own id and the id of the thread holding the lock
// it is waiting for.
Set<Thread> threads = NativeThreadMonitor.getAllThreads();
for (Thread thread : threads) {
    if (thread.getState() != Thread.State.BLOCKED) {
        continue;
    }
    long threadAddress = (long) ReflectUtil.getFieldObject(thread, "nativePeer");
    // The address may be 0 or -1 when it cannot be found or the thread has already died.
    // Only those two sentinels are rejected: a valid 64-bit pointer can have its top byte
    // set (e.g. tagged heap pointers on Android 11+) and then looks negative as a long.
    if (threadAddress == 0 || threadAddress == -1) {
        continue;
    }
    int blockThreadId = NativeThreadMonitor.getContentThreadIdArt(threadAddress);
    int curThreadId = NativeThreadMonitor.getThreadIdFromThreadPtr(threadAddress);
    // Both native calls return 0 on failure; skip incomplete records.
    if (blockThreadId != 0 && curThreadId != 0) {
        deadLock.put(curThreadId, new DeadLockThread(curThreadId, blockThreadId, thread));
    }
}
try {
    // Split the recorded blocked threads into groups.
    ArrayList<HashMap<Integer, Thread>> groups = deadLockThreadGroup();
    // Then describe the deadlock inside every group: one JSON array per group, one JSON
    // object per thread whose lock holder belongs to the same group.
    JSONObject report = new JSONObject();
    int groupIndex = 0;
    for (HashMap<Integer, Thread> members : groups) {
        JSONArray entries = new JSONArray();
        for (int threadId : members.keySet()) {
            // Look up the record for this thread, then the thread it is waiting for;
            // skip the entry when either is missing.
            DeadLockThread record = deadLock.get(threadId);
            Thread waitThread = (record == null) ? null : members.get(record.blockId);
            if (waitThread == null) {
                continue;
            }
            Thread deadThread = members.get(threadId);
            JSONObject entry = new JSONObject();
            // thread_id / wait_id are the Java-level ids from Thread.getId().
            entry.put("thread_name", deadThread.getName());
            entry.put("thread_id", deadThread.getId());
            entry.put("wait_thread", waitThread.getName());
            entry.put("wait_id", waitThread.getId());
            // Current stack of the blocked thread, one string per frame.
            JSONArray stacks = new JSONArray();
            for (StackTraceElement frame : deadThread.getStackTrace()) {
                stacks.put(frame.toString());
            }
            entry.put("thread_stack", stacks);
            entries.put(entry);
        }
        report.put("dead_lock_group_" + groupIndex, entries);
        groupIndex++;
    }
    Log.e("TAG", report.toString());
} catch (Exception e) {
    e.printStackTrace();
}
監控存活週期:
有些場景下我們想監控線程的存活週期,也就是說線程從開始啓動到運行結束總共存活了多長時間,佔了多少內存,佔了多少 CPU 等等,異常的情況下我們線下要給出警告線上要上報到服務器。目前我們能想到兩種方案一種是採用之前講的 ASM 插樁的方式,但是這種方案很多場景不適用;還有一種是今天要講到的 Native 插樁。插樁點依舊是之前的線程創建的底層原理:
http://androidxref.com/9.0.0_r3/xref/art/runtime/thread.cc
// This is the function we ultimately want to monitor (body elided in this excerpt).
void* Thread::CreateCallback(void* arg) {
// ...
}
// Pointer to the original function; its address is handed to registerInlineHook so the
// hook library can store the trampoline here.
void *(*old_create_call_back)(void *) = nullptr;

// Replacement for the hooked thread entry function: runs the original, then logs how long
// the call took together with the thread's tid and name.
//
// args: forwarded unchanged to the original function.
// Returns whatever the original function returned.
void *create_call_back(void *args) {
    // Use the monotonic clock: wall-clock time() can jump when the system time is adjusted,
    // which would produce negative or inflated lifetimes.
    struct timespec start_ts = {};
    clock_gettime(CLOCK_MONOTONIC, &start_ts);

    // Call the original function.
    void *result = old_create_call_back(args);

    struct timespec end_ts = {};
    clock_gettime(CLOCK_MONOTONIC, &end_ts);

    // Whole seconds elapsed (rounded down), matching the "%lds" log format.
    long alive_time = static_cast<long>(end_ts.tv_sec - start_ts.tv_sec);
    if (end_ts.tv_nsec < start_ts.tv_nsec) {
        --alive_time;
    }

    // Gather the current thread's info and log its lifetime.
    const int tid = gettid();
    // NOTE(review): ownership of the string returned by getThreadName is not visible
    // here — confirm whether it must be freed.
    const char *thread_name = getThreadName(tid);
    LOGE("線程信息:thread_id = %d, thread_name = %s, alive_time = %lds", tid, thread_name, alive_time);
    // Memory usage / CPU usage collection and abnormal-case warnings would go here.
    return result;
}
// Installs an inline hook on art::Thread::CreateCallback so that create_call_back runs in
// its place (the original stays reachable through old_create_call_back).
//
// Logs and returns without hooking when libart.so cannot be opened, the symbol cannot be
// resolved, or the hook cannot be registered.
// NOTE(review): the hook API takes addresses as uint32_t, so this presumably only works in
// 32-bit processes — confirm before enabling on 64-bit ABIs.
extern "C"
JNIEXPORT void JNICALL
Java_com_darren_optimize_day13_NativeThreadMonitor_monitoringThread(JNIEnv *env, jclass clazz) {
    void *so_addr = ndk_dlopen("libart.so", RTLD_LAZY);
    if (so_addr == nullptr) {
        LOGE("monitoringThread dlopen libart.so error");
        return;
    }

    void *thread_create_call_back = ndk_dlsym(so_addr, "_ZN3art6Thread14CreateCallbackEPv");
    if (thread_create_call_back == nullptr) {
        // Never try to hook address 0.
        LOGE("monitoringThread dlsym Thread::CreateCallback error");
        return;
    }

    if (registerInlineHook((uint32_t) thread_create_call_back, (uint32_t) create_call_back,
                           (uint32_t **) &old_create_call_back) != ELE7EN_OK) {
        LOGE("monitoringThread registerInlineHook error");
        // Nothing was registered, so there is nothing to activate.
        return;
    }
    LOGE("monitoringThread registerInlineHook ok");

    if (inlineHook((uint32_t) thread_create_call_back) != ELE7EN_OK) {
        LOGE("monitoringThread inlineHook error");
    } else {
        LOGE("monitoringThread inlineHook ok");
    }
}
監控 CPU 佔用率:
cpu 佔用率比較簡單,我們只需要解析到 /proc/pid/task/tid/stat 與 /proc/pid/stat 即可。
// Reads the first line of the file at `path` into a newly calloc'ed buffer of
// THREAD_NAME_LENGTH bytes and strips one trailing '\n'.
// Returns an empty string when the file cannot be opened or read, and nullptr when the
// allocation fails. The caller owns the buffer and must free() it.
// NOTE(review): a stat line may be longer than THREAD_NAME_LENGTH, in which case it is
// truncated — confirm the constant is large enough.
static char *read_first_line(const char *path) {
    char *line = static_cast<char *>(calloc(1, THREAD_NAME_LENGTH));
    if (line == nullptr) {
        return nullptr;
    }
    FILE *file = fopen(path, "r");
    if (file != nullptr) {
        if (fgets(line, THREAD_NAME_LENGTH, file) == nullptr) {
            // Buffer contents are unspecified after a failed read; keep it empty.
            line[0] = '\0';
        }
        fclose(file);
    }
    // Guard against an empty line before looking at the last character.
    const size_t length = strlen(line);
    if (length > 0 && line[length - 1] == '\n') {
        line[length - 1] = '\0';
    }
    return line;
}

// Process stat info: returns the content of /proc/<pid>/stat for the current process.
// The returned buffer is heap-allocated; the caller must free() it. Returns nullptr only
// when the allocation fails.
extern const char *getProgressInfo() {
    char path[PATH_MAX];
    snprintf(path, sizeof(path), "/proc/%d/stat", getpid());
    char *line = read_first_line(path);
    LOGE("progress info ->%s", line != nullptr ? line : "");
    return line;
}

// Thread stat info: returns the content of /proc/<pid>/task/<tid>/stat for the calling
// thread. The returned buffer is heap-allocated; the caller must free() it. Returns nullptr
// only when the allocation fails.
extern const char *getThreadInfo() {
    char path[PATH_MAX];
    snprintf(path, sizeof(path), "/proc/%d/task/%d/stat", getpid(), gettid());
    char *line = read_first_line(path);
    LOGE("thread info ->%s", line != nullptr ? line : "");
    return line;
}
寫在最後:
效能優化這東西其實可做可不做,不像需求能快速的看到收益和效果,所以這也是很多同學比較缺失的一個部分。爲什麼我們要看重這點,因爲今天市場上比較成功的公司基本都做到了"一拖三" 。首先,是團隊很強 - 創始人和團隊很強,在一個比較強的團隊帶領下,需要做到另外三點,要麼是把用戶體驗提升了、要麼能降低成本、要麼能提升效率,有的時候我們的成本也沒下降,效率也沒提升,但是如果能把用戶體驗做得極致,也可以。總之,在一個優秀的、成功的團隊基礎之上,我們只要能夠把用戶體驗、能夠把成本或者效率這三者至少做到一點,同時另外兩點又沒有減損的話,基本上你就可以成功了。
視頻鏈接:https://pan.baidu.com/s/1VNIzgGUhfqBNmarkpt8tIw
視頻密碼:kzjj