本以为写写异构代码也不算一件难事。毕竟做 Java 开发,日常也写些 C 代码,无论如何也不至于到棘手到地步。哪曾想,JNI 开发不难,但问题颇多。
为了排查一个 JNI 的堆外内存泄露问题,简直是绞尽脑汁而不得其门。最后查找的问题原因也特别简单,但凡认真学习了 JNI 开发都能够避免。
问题代码类似于下列代码:
#include "Main.h"
JNIEXPORT void JNICALL Java_Main_sayHi (JNIEnv *env, jobject jobj, jstring jstr)
{
const char *str = (*env)->GetStringUTFChars(env, jstr, 0);
int len = (*env)->GetStringUTFLength(env, jstr);
// ... some code omitted
return ;
}
问题表现
最初的表现很明显,2核4G的机器,2GB的 Java 堆,最终物理内存不够用,导致 Java 进程被强制杀掉。几乎毫无疑问,就是 Java 堆外内存引发的问题。不过遗憾的是,物理内存不够用,没有留下足够多的堆栈详情。另外找了一台快要把内存打满的机器,打印信息。
$ pmap -x 27765 | sort -n -k3 | tail -n 50
00007f460c9ac000 55632 23896 23896 rw--- [ anon ]
00007f45d0000000 38980 38980 38980 rw--- [ anon ]
00007f45dc000000 45980 39512 39512 rw--- [ anon ]
00007f45e0000000 46120 39652 39652 rw--- [ anon ]
00007f45d4000000 42160 41744 41744 rw--- [ anon ]
00007f45c8000000 45992 45992 45992 rw--- [ anon ]
00007f4610000000 48896 48872 48872 rwx-- [ anon ]
00007f45f8000000 51084 51084 51084 rw--- [ anon ]
00007f4600000000 52328 52328 52328 rw--- [ anon ]
00007f4608000000 58332 55452 55452 rw--- [ anon ]
00000000014f8000 57036 56908 56908 rw--- [ anon ]
00007f45d8000000 63760 57292 57292 rw--- [ anon ]
00007f45e8000000 58488 58488 58488 rw--- [ anon ]
00007f4604000000 59912 59912 59912 rw--- [ anon ]
00007f45fc000000 60080 60080 60080 rw--- [ anon ]
00007f45e4000000 64460 64452 64452 rw--- [ anon ]
00007f45f4000000 65072 65072 65072 rw--- [ anon ]
00007f4620000000 65536 65536 65536 rw--- [ anon ]
0000000080000000 2105600 2008240 2008240 rw--- [ anon ] # 2GB 的 Java 堆
total kB 4885820 3040036 3030476
比较多的内存块都在 64M 左右大小。确实也很明显,就是申请的内存没有得到释放。
问题排查
C 不似 Java,没有 Garbage Collection 。所有申请的内存都必须主动进行释放。唯一的问题就是 GetStringUTFChars
得到的内存是否会被 JVM 统一管理。
结果当然是不会,虽然是 Java 与 C 衔接的胶水代码,但这部分得到的内存需要自行管理。不过,JNI 也提供了函数来进行释放。
/* from src/share/vm/prims/jni.cpp */
JNI_ENTRY(const char*, jni_GetStringUTFChars(JNIEnv *env, jstring string, jboolean *isCopy))
JNIWrapper("GetStringUTFChars");
char* result = NULL;
oop java_string = JNIHandles::resolve_non_null(string);
if (java_lang_String::value(java_string) != NULL) {
size_t length = java_lang_String::utf8_length(java_string);
/* 申请堆内存 */
result = AllocateHeap(length + 1, mtInternal, 0, AllocFailStrategy::RETURN_NULL);
if (result != NULL) {
java_lang_String::as_utf8_string(java_string, result, (int) length + 1);
if (isCopy != NULL) {
*isCopy = JNI_TRUE;
}
}
}
return result;
JNI_END
/* from src/share/vm/memory/allocation.inline.hpp */
inline char* AllocateHeap(size_t size, MEMFLAGS flags,
const NativeCallStack& stack,
AllocFailType alloc_failmode = AllocFailStrategy::EXIT_OOM) {
// 申请内存
char* p = (char*) os::malloc(size, flags, stack);
#ifdef ASSERT
if (PrintMallocFree) trace_heap_malloc(size, "AllocateHeap", p);
#endif
if (p == NULL && alloc_failmode == AllocFailStrategy::EXIT_OOM) {
vm_exit_out_of_memory(size, OOM_MALLOC_ERROR, "AllocateHeap");
}
return p;
}
/* from src/share/vm/prims/jni.cpp */
JNI_LEAF(void, jni_ReleaseStringUTFChars(JNIEnv *env, jstring str, const char *chars))
JNIWrapper("ReleaseStringUTFChars");
if (chars != NULL) {
// 释放内存
FreeHeap((char*) chars);
}
);
JNI_END
/* from src/share/vm/memory/allocation.inline.hpp */
inline void FreeHeap(void* p, MEMFLAGS memflags = mtInternal) {
#ifdef ASSERT
if (PrintMallocFree) trace_heap_free(p);
#endif
os::free(p, memflags);
}
从名称就可以看出来,GetStringUTFChars
和 ReleaseStringUTFChars
是相对的方法。而且默认走的都是 Glibc 的 malloc
和 free
。
问题验证
为了验证问题的原因和最终的现象,整了一段代码来测试效果。
import java.io.IOException;
public class Main {
static {
System.load("/root/jni/Main.so");
}
private static final String value = "AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHH";
public native void sayHi(String value);
public static void main(String[] args) throws IOException {
Main main = new Main();
while (true)
{
for (int i=0;i<100000;i++) {
char c1 = (char) ('A' + Math.random() * 26);
char c2 = (char) ('a' + Math.random() * 26);
main.sayHi(value + c1 + c2);
}
System.gc();
int val = System.in.read();
if (val == '1')
break;
}
}
}
#include "Main.h"
JNIEXPORT void JNICALL Java_Main_sayHi (JNIEnv *env, jobject jobj, jstring jstr)
{
const char *str = (*env)->GetStringUTFChars(env, jstr, 0);
int len = (*env)->GetStringUTFLength(env, jstr);
printf("%p\n", str);
#ifdef SUCCESS
(*env)->ReleaseStringUTFChars(env, jstr, str);
#endif
return ;
}
$ java Main
$ pmap -x 22690 | sort -n -k3
# ...
00007f0d44000000 65536 65500 65500 rw--- [ anon ]
00007f0d04000000 131072 131068 131068 rw--- [ anon ]
00007f0cc4000000 131072 131072 131072 rw--- [ anon ]
00007f0ccc000000 131072 131072 131072 rw--- [ anon ]
00007f0cd4000000 131072 131072 131072 rw--- [ anon ]
00007f0cdc000000 131072 131072 131072 rw--- [ anon ]
00007f0ce4000000 131072 131072 131072 rw--- [ anon ]
00007f0cec000000 131072 131072 131072 rw--- [ anon ]
00007f0cf4000000 131072 131072 131072 rw--- [ anon ]
00007f0cfc000000 131072 131072 131072 rw--- [ anon ]
total kB 3822700 1293952 1276900
确实引起了内存的增长,而将内存 dump 下了之后,也看到了大量字符串。
$ PID=23756; grep rw-p /proc/${PID}/maps | sed -n 's/^\([0-9a-f]*\)-\([0-9a-f]*\) .*$/\1 \2/p' | while read start stop; \
do \
gdb --batch --pid ${PID} -ex "dump memory ${PID}-$start-$stop.dump 0x$start 0x$stop"; \
done
$ strings 23756-7f6f6c000000-7f6f70000000.dump| grep 'AAAA'
# ...
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHVn
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHXp
AAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHAAAAABBBBBCCCCCDDDDDEEEEEGGGGGHHHHHFn
待到添加上 ReleaseStringUTFChars
,申请到内存得到释放,确实解决了这个问题。
总结
定位问题确实看起来不难,但这种异构代码,真正难点在于定位问题。这期间走了不少的弯路,也无法确定究竟是对是错。毕竟问题的原因多种多样:
- 可能是 Java 代码用了诸如 Zip 等容易出现堆外内存泄露的方法
- 不当使用 JNI (就像是上文描述的问题)
- 再有就是第三方 C 库使用不当,或者其本身存在 BUG。
这是第一次排查异构代码的问题,颇多难以分辨,无法做定论之事。
再就是 C 的平台相关性实在是太过于严重了,加之本就不太熟悉,搞个编译新的库文件就好一番折腾。
__ __
/ _| __ _ _ __ __ _ / _| ___ _ __ __ _
| |_ / _` | '_ \ / _` | |_ / _ \ '_ \ / _` |
| _| (_| | | | | (_| | _| __/ | | | (_| |
|_| \__,_|_| |_|\__, |_| \___|_| |_|\__, |
|___/ |___/