新聞中心
某Team在用Python開發一些代碼,涉及子進程以及設法消除僵尸進程的需求。實踐中他們碰上Python程序非預期退出的現象。最初他們決定用GDB調試Python解釋器,查看exit()的源頭。我聽了之后,覺得這個問題應該用別的調試思路。幫他們排查這次程序故障時,除去原始問題,還衍生了其他問題。

這次的問題相比西安研發中心曾經碰上的Python信號處理問題,有不少基礎知識、先驗知識是共用的,此處不再做普及,感興趣的同學可以翻看我以前發過的文章。
下文是一次具體的調試、分析記錄。為了簡化現場、方便調試,已將原始問題、衍生問題濃縮成DebugPythonWithGDB_6.py、DebugPythonWithGDB_7.py。
$ vi DebugPythonWithGDB_6.py
PHP
#!/usr/bin/env python# -*- encoding: utf-8 -*-import sys, os, signal, subprocess, shlex, tracebackdef on_SIGCHLD ( signum, frame ) :print "[on_SIGCHLD"sys.stdout.write( "signum = %u\n" % signum )traceback.print_stack( frame )print os.waitpid( -1, os.WNOHANG )"""try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )"""print "on_SIGCHLD]"def do_more ( count ) :print '[do_more() begin %u]' % countos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""## 這里存在競爭條件,可以增加觸發(fā)OSError異常的概率#os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )"""print '[do_more() end %u]' % countdef main ( prog, args ) :if 0 == len( args ) :print 'Usage: %s ' % progelse :sys.stdout.write( "Parent = %u\n" % os.getpid() )## 本例中,即使有下列代碼,Ctrl-C仍然無效。#signal.signal( signal.SIGINT, signal.SIG_DFL )## signal.signal( signal.SIGCHLD, signal.SIG_IGN )#signal.signal( signal.SIGCHLD, on_SIGCHLD )#count = 0while True :## 本例中父進程只是一個調度框架,不需要與子進程進行通信,因此不# 需要特別處理"stdin=None, stdout=None, stderr=None"。#child = subprocess.Popen \(## 不要直接用args[0].split(),它在處理單、雙引號時不是我們# 期望的行為??紤]這種例子,ls -l "/tmp/non exist"#shlex.split( args[0] ),## all file descriptors except 0, 1 and 2 will be closed# before the child process is executed#close_fds = True,cwd = "/tmp")sys.stdout.write( "Child = %u\n" % child.pid )## child.send_signal( signal.SIGTERM )# child.terminate()#child.kill()## child.wait()#do_more( count )count += 1if '__main__' == __name__ :try :main( os.path.basename( sys.argv[0] ), sys.argv[1:] )except KeyboardInterrupt :pass
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 | #!/usr/bin/env python # -*- encoding: utf-8 -*- import sys, os, signal, subprocess, shlex, traceback def on_SIGCHLD ( signum, frame ) : print "[on_SIGCHLD" sys.stdout.write( "signum = %u\n" % signum ) traceback.print_stack( frame ) print os.waitpid( -1, os.WNOHANG ) """ try : print os.waitpid( -1, os.WNOHANG ) except OSError : sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno ) """ print "on_SIGCHLD]" def do_more ( count ) : print '[do_more() begin %u]' % count os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) """ # # 這里存在競爭條件,可以增加觸發(fā)OSError異常的概率 # os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) """ print '[do_more() end %u]' % count def main ( prog, args ) : if 0 == len( args ) : print 'Usage: %s ' % prog else : sys.stdout.write( "Parent = %u\n" % os.getpid() ) # # 本例中,即使有下列代碼,Ctrl-C仍然無效。 # signal.signal( signal.SIGINT, signal.SIG_DFL ) # # signal.signal( signal.SIGCHLD, signal.SIG_IGN ) # signal.signal( signal.SIGCHLD, on_SIGCHLD ) # count = 0 while True : # # 本例中父進程只是一個調度框架,不需要與子進程進行通信,因此不 # 需要特別處理"stdin=None, stdout=None, stderr=None"。 # child = subprocess.Popen \ ( # # 不要直接用args[0].split(),它在處理單、雙引號時不是我們 # 期望的行為??紤]這種例子,ls -l "/tmp/non exist" # shlex.split( args[0] ), # # all file descriptors except 0, 1 and 2 will be closed # before the child process is executed # close_fds = True, cwd = "/tmp" ) sys.stdout.write( "Child = %u\n" % child.pid ) # # child.send_signal( signal.SIGTERM ) # child.terminate() # child.kill() # # child.wait() # do_more( count ) count += 1 if '__main__' == __name__ : try : main( os.path.basename( sys.argv[0] 
), sys.argv[1:] ) except KeyboardInterrupt : pass |
PHP
$ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"'Parent = 10244Child = 10245[do_more() begin 0][on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 20, in do_moreprint '[do_more() begin %u]' % count(10245, 9)on_SIGCHLD]Child = 10246[on_SIGCHLDsignum = 17File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )Traceback (most recent call last):File "DebugPythonWithGDB_6.py", line 81, inmain( os.path.basename( sys.argv[0] ), sys.argv[1:] )File "DebugPythonWithGDB_6.py", line 76, in maindo_more( count )File "DebugPythonWithGDB_6.py", line 21, in do_moreos.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLDprint os.waitpid( -1, os.WNOHANG )OSError: [Errno 10] No child processes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 | $ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"' Parent = 10244 Child = 10245 [do_more() begin 0] [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 20, in do_more print '[do_more() begin %u]' % count (10245, 9) on_SIGCHLD] Child = 10246 [on_SIGCHLD signum = 17 File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 21, in do_more os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) Traceback (most recent call last): File "DebugPythonWithGDB_6.py", line 81, in main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) File "DebugPythonWithGDB_6.py", line 76, in main do_more( count ) File "DebugPythonWithGDB_6.py", line 21, in do_more os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) File "DebugPythonWithGDB_6.py", line 10, in on_SIGCHLD print os.waitpid( -1, os.WNOHANG ) OSError: [Errno 10] No child processes |
流程進入on_SIGCHLD(),但os.waitpid()拋出OSError異常。幫助里寫的是,如果系統調用waitpid()返回-1,就拋出異常: An OSError is raised with the value of errno when the syscall returns -1. 10245號子進程在on_SIGCHLD()里waitpid()成功,(10245, 9)中的9表示該進程是被SIGKILL干掉的,符合預期。 10246號子進程是do_more()里的os.system()產生的shell進程,它結束時向10244號父進程投遞了SIGCHLD信號。 on_SIGCHLD()里waitpid()時,已經在別處wait*()過,10246號子進程已經徹底消失,系統調用waitpid()返回-1,Python函數os.waitpid()拋出異常。 整個過程非常復雜,用偽代碼描述如下:
PHP
do_more()os.system()posix_system() // posixmodule.c__libc_system() // weak_alias (__libc_system, system)do_system() // sysdeps/posix/system.c/** SIG_IGN** Ctrl-C暫時失效*/sigaction( SIGINT, &sa, &intr )/** 屏蔽(阻塞)SIGCHLD信號*/sigaddset( &sa.sa_mask, SIGCHLD )sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )fork()子進程(10246號子進程)/** 恢復原有SIGINT信號處理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 調用"sh -c ..."*/execve()[shell子進程結束,向DebugPythonWithGDB_6.py投遞SIGCHLD][由于SIGCHLD信號已被屏蔽(阻塞),其保持在內核態(tài)的未決信號鏈上]父進程(10244號父進程)/** 同步調用,會阻塞。不是在信號句柄中異步調用。** 10246號子進程在此被wait*()回收后徹底消失*/waitpid( pid, &status, 0 )/** 恢復原有SIGINT信號處理方式*/sigaction( SIGINT, &intr, (struct sigaction *)NULL )/** 取消對SIGCHLD的屏蔽(阻塞)*/sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL )[SIGCHLD信號的屏蔽(阻塞)被取消][DebugPythonWithGDB_6.py的C級信號句柄signal_handler()安排"延遲調用"后返回][DebugPythonWithGDB_6.py的on_SIGCHLD()此時并未得到執(zhí)行,因為built-in函數(shù)os.system()尚未返回]/** built-in函數(shù)os.system()返回后,10244號父進程開始處理"延遲調用",調用* Python級信號句柄。這個SIGCHLD信號是10246號子進程投遞過來的。** DebugPythonWithGDB_6.py的on_SIGCHLD()得到執(zhí)行*/on_SIGCHLD()/** 調用waitpid( -1, &status, WNOHANG ),試圖處理10246號子進程。** 10246號子進程已為前述waitpid( pid, &status, 0 )所處理,此處系統(tǒng)調用* 返回-1,導致os.waitpid()拋出OSError異常。*/os.waitpid( -1, os.WNOHANG )
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | do_more() os.system() posix_system() // posixmodule.c __libc_system() // weak_alias (__libc_system, system) do_system() // sysdeps/posix/system.c /* * SIG_IGN * * Ctrl-C暫時失效 */ sigaction( SIGINT, &sa, &intr ) /* * 屏蔽(阻塞)SIGCHLD信號 */ sigaddset( &sa.sa_mask, SIGCHLD ) sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask ) fork() 子進程(10246號子進程) /* * 恢復原有SIGINT信號處理方式 */ sigaction( SIGINT, &intr, (struct sigaction *)NULL ) /* * 調用"sh -c ..." */ execve() [shell子進程結束,向DebugPythonWithGDB_6.py投遞SIGCHLD] [由于SIGCHLD信號已被屏蔽(阻塞),其保持在內核態(tài)的未決信號鏈上] 父進程(10244號父進程) /* * 同步調用,會阻塞。不是在信號句柄中異步調用。 * * 10246號子進程在此被wait*()回收后徹底消失 */ waitpid( pid, &status, 0 ) /* * 恢復原有SIGINT信號處理方式 */ sigaction( SIGINT, &intr, (struct sigaction *)NULL ) /* * 取消對SIGCHLD的屏蔽(阻塞) */ sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) [SIGCHLD信號的屏蔽(阻塞)被取消] [DebugPythonWithGDB_6.py的C級信號句柄signal_handler()安排"延遲調用"后返回] [DebugPythonWithGDB_6.py的on_SIGCHLD()此時并未得到執(zhí)行,因為built-in函數(shù)os.system()尚未返回] /* * built-in函數(shù)os.system()返回后,10244號父進程開始處理"延遲調用",調用 * Python級信號句柄。這個SIGCHLD信號是10246號子進程投遞過來的。 * * DebugPythonWithGDB_6.py的on_SIGCHLD()得到執(zhí)行 */ on_SIGCHLD() /* * 調用waitpid( -1, &status, WNOHANG ),試圖處理10246號子進程。 * * 10246號子進程已為前述waitpid( pid, &status, 0 )所處理,此處系統(tǒng)調用 * 返回-1,導致os.waitpid()拋出OSError異常。 */ os.waitpid( -1, os.WNOHANG ) |
整個過程之所以如此復雜,主要是因為Python的信號處理機制比較復雜,讓已經非常復雜的Linux信號機制再添變數。參看:
PHP
《2.50 對Python解釋器進行調試》《22.0 Linux信號機制》
1 2 | 《2.50 對Python解釋器進行調試》 《22.0 Linux信號機制》 |
就本例而言,為了確保DebugPythonWithGDB_6.py不因OSError異常而終止,只需在on_SIGCHLD()中調用os.waitpid()時捕捉OSError異常:
PHP
def on_SIGCHLD ( signum, frame ) :try :print os.waitpid( -1, os.WNOHANG )except OSError :sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno )
1 2 3 4 5 | def on_SIGCHLD ( signum, frame ) : try : print os.waitpid( -1, os.WNOHANG ) except OSError : sys.stdout.write( 'Line[%u]: OSError\n' % sys.exc_info()[2].tb_lineno ) |
前述觀點有些是動態調試得到,有些是靜態分析得到。有人可能問了,為什么不攔截Python進程的C級信號句柄,查看SIGCHLD信號源,以此確認10246號子進程可能被回收兩次?其實我最初也想這么干來著,但這是行不通的,因為Python的C級信號句柄signal_handler()是那種最原始的單形參信號句柄,不是高大上的三形參信號句柄。 用GDB調試Python解釋器:
PHP
# gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'...Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185.Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\"[Thread debugging using libthread_db enabled]Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1".Parent = 10284Child = 10288[do_more() begin 0]Child = 10289Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185185 {(gdb) py-bt#10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0)os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' )#13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=)do_more( count )#16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in ()main( os.path.basename( sys.argv[0] ), sys.argv[1:] )(gdb) bt 7#0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185#1#2 0xb7fdcd3c in __kernel_vsyscall ()#3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57#4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161#5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184#6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28(More stack frames follow...)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 | # gdb -q -ex "b *signal_handler" -ex r --args /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py '/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"' ... Breakpoint 1 at 0x8216f2d: file ../Modules/signalmodule.c, line 185. Starting program: /usr/bin/python2.7-dbg DebugPythonWithGDB_6.py /usr/bin/python2.7-dbg\ -c\ \"import\ time\;time.sleep\(3600\)\" [Thread debugging using libthread_db enabled] Using host libthread_db library "/lib/i386-linux-gnu/i686/cmov/libthread_db.so.1". Parent = 10284 Child = 10288 [do_more() begin 0] Child = 10289 Breakpoint 1, signal_handler (sig_num=17) at ../Modules/signalmodule.c:185 185 { (gdb) py-bt #10 Frame 0xb7c20034, for file DebugPythonWithGDB_6.py, line 21, in do_more (count=0) os.system( r'printf "Child = %u\n" $$;/bin/sleep 1' ) #13 Frame 0xb7cb37dc, for file DebugPythonWithGDB_6.py, line 76, in main (prog='DebugPythonWithGDB_6.py', args=['/usr/bin/python2.7-dbg -c "import time;time.sleep(3600)"'], count=0, child=) do_more( count ) #16 Frame 0xb7cbe49c, for file DebugPythonWithGDB_6.py, line 81, in () main( os.path.basename( sys.argv[0] ), sys.argv[1:] ) (gdb) bt 7 #0 signal_handler (sig_num=17) at ../Modules/signalmodule.c:185 #1 #2 0xb7fdcd3c in __kernel_vsyscall () #3 0xb7db25eb in __sigprocmask (how=how@entry=2, set=0x0, set@entry=0xbffff0d4, oset=oset@entry=0x0) at ../sysdeps/unix/sysv/linux/sigprocmask.c:57 #4 0xb7dc2084 in do_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:161 #5 0xb7dc2380 in __libc_system (line=line@entry=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at ../sysdeps/posix/system.c:184 #6 0xb7fa9bfb in system (line=0xb7cbf9e4 "printf \"Child = %u\\n\" $$;/bin/sleep 1") at pt-system.c:28 (More stack frames follow...) |
查看#4的system.c:161,這個位置已經在waitpid( pid, &status, 0 )之后: sigprocmask( SIG_SETMASK, &omask, (sigset_t *)NULL ) 其作用是取消對SIGCHLD的屏蔽(阻塞)。 此時內存布局如下:
PHP
內存高址方向fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc)retcode // ESP+0x2D4 x/3i $esp+0x2d4extramask // ESP+0x2D0 x/1wx $esp+0x2d0fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60)sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8)sig // ESP+4 信號值,信號句柄第一形參pretcode // ESP RetAddr=__kernel_sigreturn// hexdump $esp 0x2dc內存低址方向
1 2 3 4 5 6 7 8 9 10 11 12 | 內存高址方向 fpstate // ESP+0x2DC output/x *(struct _fpstate *)($esp+0x2dc) retcode // ESP+0x2D4 x/3i $esp+0x2d4 extramask // ESP+0x2D0 x/1wx $esp+0x2d0 fpstate_unused // ESP+0x60 output/x *(struct _fpstate *)($esp+0x60) sigcontext_ia32 // ESP+8 output/x *(struct sigcontext *)($esp+8) sig // ESP+4 信號值,信號句柄第一形參 pretcode // ESP RetAddr=__kernel_sigreturn // hexdump $esp 0x2dc 內存低址方向 |
PHP
(gdb) x/2wa $esp0xbfffea6c: 0xb7fdcd18 0x11(gdb) x/3i $esp+0x2d40xbfffed40: pop eax0xbfffed41: mov eax,0x770xbfffed46: int 0x80(gdb) output/x *(struct sigcontext *)($esp+8){gs = 0x33,__gsh = 0x0,fs = 0x0,__fsh = 0x0,es = 0x7b,__esh = 0x0,ds = 0x7b,__dsh = 0x0,edi = 0xb7f2a000,esi = 0x8,ebp = 0x1,esp = 0xbfffeff0,ebx = 0x2,edx = 0x0,ecx = 0xbffff0d4,eax = 0x0,trapno = 0x1,err = 0x0,eip = 0xb7fdcd3c,cs = 0x73,__csh = 0x0,eflags = 0x246,esp_at_signal = 0xbfffeff0,ss = 0x7b,__ssh = 0x0,fpstate = 0xbfffed50,oldmask = 0x0,cr2 = 0x0}
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 | (gdb) x/2wa $esp 0xbfffea6c: 0xb7fdcd18 0x11 (gdb) x/3i $esp+0x2d4 0xbfffed40: pop eax 0xbfffed41: mov eax,0x77 0xbfffed46: int 0x80 (gdb) output/x *(struct sigcontext *)($esp+8) { gs = 0x33, __gsh = 0x0, fs = 0x0, __fsh = 0x0, es = 0x7b, __esh = 0x0, ds = 0x7b, __dsh = 0x0, edi = 0xb7f2a000, esi = 0x8, ebp = 0x1, esp = 0xbfffeff0, ebx = 0x2, edx = 0x0, ecx = 0xbffff0d4, eax = 0x0, trapno = 0x1, err = 0x0, eip = 0xb7fdcd3c, cs = 0x73, __csh = 0x0, eflags = 0x246, esp_at_signal = 0xbfffeff0, ss = 0x7b, __ssh = 0x0, fpstate = 0xbfffed50, oldmask = 0x0, cr2 = 0x0 } |
因為是單形參信號句柄,沒有siginfo,無法在用戶態獲知信號源。但我分析此時的信號源不是10289號子進程,而是10288號子進程。10288產生SIGCHLD時,SIGCHLD信號已被屏蔽(阻塞),只能保持在內核態的未決信號鏈上。之后待10289產生SIGCHLD時,sigpending.signal中相應位已經置位,10289產生的SIGCHLD被丟棄,不會進入內核態的未決信號鏈。SIGCHLD信號的屏蔽(阻塞)被取消后,從內核態的未決信號鏈上取出10288產生的SIGCHLD進行處理。于是斷點命中。 如果完全理解了前述實驗結果及分析,就會發現DebugPythonWithGDB_6.py存在競爭條件。subprocess.Popen()對應的子進程投遞SIGCHLD信號時,父進程有兩種可能:
PHP
1) os.system()調用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前2) os.system()調用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后
1 2 | 1) os.system()調用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之前 2) os.system()調用sigprocmask( SIG_BLOCK, &sa.sa_mask, &omask )之后 |
情況1)會觸發OSError異常,情況2)不會觸發OSError異常。執行: $ python DebugPythonWithGDB_6.py 'python -c "import time;time.sleep(3600)"' 有時會因OSError異常而終止,有時就一直循環執行下去。出現這種差異,正是競爭環境的表征。 小結一下: 假設針對SIGCHLD安裝了Python級信號句柄,其調用os.waitpid( -1, os.WNOHANG )回收子進程。如果別處會調用os.system(),則必須在os.waitpid()外側捕捉OSError異常。不建議這種方式的混用。 對waitpid()的分析到此就結束了,說點調試過程中出現的其他問題。 意外地發現Ctrl-C無法終止情況2),而我已經調用: signal.signal( signal.SIGINT, signal.SIG_DFL ) 這是因為do_system()中一上來就調用了:
PHP
sa.sa_handler = SIG_IGN;sigaction( SIGINT, &sa, &intr );
1 2 | sa.sa_handler = SIG_IGN; sigaction( SIGINT, &sa, &intr ); |
導致Ctrl-C暫時失效,直至do_system()結束。假設DebugPythonWithGDB_6.py已經出現情況2),查看它的信號處理方式:
PHP
# ps auwx | grep pythonroot 10355 0.0 0.5 8116 5812 pts/0 S+ 15:57 0:00 python DebugPythonWithGDB_6.py python -c "import time;time.sleep(3600)"root 10389 0.0 0.0 0 0 pts/0 Z+ 15:57 0:00 [python]root 10393 0.0 0.0 2936 852 pts/1 R+ 15:57 0:00 grep python# stap -DMAXACTION=10000 -g /usr/share/doc/systemtap-doc/examples/process/psig.stp -x 1035510355: pythonHUP defaultINT ignored // 不是預期的defaultQUIT ignoredILL defaultTRAP defaultABRT defaultBUS defaultFPE defaultKILL defaultUSR1 defaultSEGV defaultUSR2 defaultPIPE ignoredALRM defaultTERM defaultSTKFLT defaultCHLD blocked,caught 0x818a480 0...


咨詢
建站咨詢
