ecg_annotation.py

#import matplotlib with pdf as backend
import matplotlib 
matplotlib.use('PDF')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

import wfdb 
import os
import numpy as np
import math
import sys
import scipy.stats as st
import glob, os
from os.path import basename


import tensorflow as tf
from keras.layers import Dense,Activation,Dropout
from keras.layers import LSTM,Bidirectional #could try TimeDistributed(Dense(...))
from keras.models import Sequential, load_model
from keras import optimizers,regularizers
from keras.layers.normalization import BatchNormalization
import keras.backend.tensorflow_backend as KTF
np.random.seed(0)

# functions
def get_ecg_data(datfile): 
	## convert .dat/q1c to numpy arrays
	recordname=os.path.basename(datfile).split(".dat")[0]
	recordpath=os.path.dirname(datfile)
	cwd=os.getcwd()
	os.chdir(recordpath) ## somehow it only works if you chdir. 

	annotator='q1c'
	annotation = wfdb.rdann(recordname, extension=annotator, sampfrom=0,sampto = None, pbdir=None)
	Lstannot=list(zip(annotation.sample,annotation.symbol,annotation.aux_note))

	FirstLstannot=min( i[0] for i in Lstannot)
	LastLstannot=max( i[0] for i in Lstannot)-1
	print("first-last annotation:", FirstLstannot,LastLstannot)
	
	record = wfdb.rdsamp(recordname, sampfrom=FirstLstannot,sampto = LastLstannot) #wfdb.showanncodes()
	annotation = wfdb.rdann(recordname, annotator, sampfrom=FirstLstannot,sampto = LastLstannot) ## get annotation between first and last. 
	annotation2 = wfdb.Annotation(recordname='sel32', extension='niek', sample=(annotation.sample-FirstLstannot), symbol = annotation.symbol, aux_note=annotation.aux_note)

	Vctrecord=np.transpose(record.p_signals)
	VctAnnotationHot=np.zeros( (6,len(Vctrecord[1])), dtype=np.int)
	VctAnnotationHot[5]=1 ## inverse of the others 
	#print("ecg, 2 lead of shape" , Vctrecord.shape) 
	#print("VctAnnotationHot of shape" , VctAnnotationHot.shape) 
	#print('plotting extracted signal with annotation')
	#wfdb.plotrec(record, annotation=annotation2, title='Record 100 from MIT-BIH Arrhythmia Database', timeunits = 'seconds')

	VctAnnotations=list(zip(annotation2.sample,annotation2.symbol)) ## zip coordinates + annotations (N),(t) etc)
	#print(VctAnnotations)
	for i in range(len(VctAnnotations)):
		#print(VctAnnotations[i]) # Print to display annotations of an ecg
		try: 
			
			if VctAnnotations[i][1]=="p":
				if VctAnnotations[i-1][1]=="(":
					pstart=VctAnnotations[i-1][0]
				if VctAnnotations[i+1][1]==")":
					pend=VctAnnotations[i+1][0]
				if VctAnnotations[i+3][1]=="N":
					rpos=VctAnnotations[i+3][0]
					if VctAnnotations[i+2][1]=="(":
						qpos=VctAnnotations[i+2][0]
					if VctAnnotations[i+4][1]==")":
						spos=VctAnnotations[i+4][0]	
					for ii in range(0,8): ## search for t (sometimes the "(" for the t  is missing  )
						if VctAnnotations[i+ii][1]=="t":
							tpos=VctAnnotations[i+ii][0]
							if VctAnnotations[i+ii+1][1]==")":
								tendpos=VctAnnotations[i+ii+1][0]
			# 				#print(ppos,qpos,rpos,spos,tendpos)
								VctAnnotationHot[0][pstart:pend]=1 #P segment
								VctAnnotationHot[1][pend:qpos]=1 #part "nothing" between P and Q, previously left unnanotated, but categorical probably can't deal with that
								VctAnnotationHot[2][qpos:rpos]=1 #QR
								VctAnnotationHot[3][rpos:spos]=1 #RS
								VctAnnotationHot[4][spos:tendpos]=1 #ST (from end of S to end of T)
								VctAnnotationHot[5][pstart:tendpos]=0 #tendpos:pstart becomes 1, because it is inverted above					
		except IndexError:
			pass
	
	Vctrecord=np.transpose(Vctrecord) # transpose to (timesteps,feat)
	VctAnnotationHot=np.transpose(VctAnnotationHot)
	os.chdir(cwd)
	return Vctrecord, VctAnnotationHot


def splitseq(x,n,o):
	#split seq; should be optimized so that remove_seq_gaps is not needed. 
	upper=math.ceil( x.shape[0] / n) *n
	print("splitting on",n,"with overlap of ",o,	"total datapoints:",x.shape[0],"; upper:",upper)
	for i in range(0,upper,n):
		#print(i)
		if i==0:
			padded=np.zeros( ( o+n+o,x.shape[1])   ) ## pad with 0's on init
			padded[o:,:x.shape[1]] = x[i:i+n+o,:]
			xpart=padded
		else:
			xpart=x[i-o:i+n+o,:]
		if xpart.shape[0]<i:

			padded=np.zeros( (o+n+o,xpart.shape[1])  ) ## pad with 0's on end of seq
			padded[:xpart.shape[0],:xpart.shape[1]] = xpart
			xpart=padded

		xpart=np.expand_dims(xpart,0)## add one dimension; so that you get shape (samples,timesteps,features)
		try:
			xx=np.vstack(  (xx,xpart) )
		except UnboundLocalError: ## on init
			xx=xpart
	print("output: ",xx.shape)
	return(xx)

def remove_seq_gaps(x,y):
	#remove parts that are not annotated <- not ideal, but quickest for now.
	window=150
	c=0
	cutout=[]
	include=[]
	print("filterering.")
	print("before shape x,y",x.shape,y.shape)
	for i in range(y.shape[0]):
		
		c=c+1
		if c<window :
			include.append(i)
		if sum(y[i,0:5])>0:
			c=0 
		if c >= window:
			#print ('filtering')
			pass
	x,y=x[include,:],y[include,:]
	print(" after shape x,y",x.shape,y.shape)
	return(x,y)


def normalizesignal(x):
	x=st.zscore(x, ddof=0)
	return x
def normalizesignal_array(x):
	for i in range(x.shape[0]):
		x[i]=st.zscore(x[i], axis=0, ddof=0)
	return x

def plotecg(x,y,begin,end):
	#helper to plot ecg
	plt.figure(1,figsize=(11.69,8.27))
	plt.subplot(211)
	plt.plot(x[begin:end,0])
	plt.subplot(211)
	plt.plot(y[begin:end,0])
	plt.subplot(211)
	plt.plot(y[begin:end,1])
	plt.subplot(211)
	plt.plot(y[begin:end,2])
	plt.subplot(211)
	plt.plot(y[begin:end,3])
	plt.subplot(211)
	plt.plot(y[begin:end,4])
	plt.subplot(211)
	plt.plot(y[begin:end,5])

	plt.subplot(212)
	plt.plot(x[begin:end,1])
	plt.show()

def plotecg_validation(x,y_true,y_pred,begin,end):
	#helper to plot ecg
	plt.figure(1,figsize=(11.69,8.27))
	plt.subplot(211)
	plt.plot(x[begin:end,0])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,0])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,1])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,2])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,3])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,4])
	plt.subplot(211)
	plt.plot(y_pred[begin:end,5])

	plt.subplot(212)
	plt.plot(x[begin:end,1])
	plt.subplot(212)
	plt.plot(y_true[begin:end,0])
	plt.subplot(212)
	plt.plot(y_true[begin:end,1])
	plt.subplot(212)
	plt.plot(y_true[begin:end,2])
	plt.subplot(212)
	plt.plot(y_true[begin:end,3])
	plt.subplot(212)
	plt.plot(y_true[begin:end,4])
	plt.subplot(212)
	plt.plot(y_true[begin:end,5])
	
def LoaddDatFiles(datfiles):  
	for datfile in datfiles:
	    print(datfile)
	    if basename(datfile).split(".",1)[0] in exclude:
	    	continue
	    qf=os.path.splitext(datfile)[0]+'.q1c'
	    if os.path.isfile(qf):
	    	#print("yes",qf,datfile)
	    	x,y=get_ecg_data(datfile)
	    	x,y=remove_seq_gaps(x,y)

	    	x,y=splitseq(x,1000,150),splitseq(y,1000,150) ## create equal sized numpy arrays of n size and overlap of o 

	    	x = normalizesignal_array(x)
	    	## todo; add noise, shuffle leads etc. ?
	    	try: ## concat
	    		xx=np.vstack(  (xx,x) )
	    		yy=np.vstack(  (yy,y) )
	    	except NameError: ## if xx does not exist yet (on init)
	    		xx = x
	    		yy = y
	return(xx,yy)

def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

def get_session(gpu_fraction=0.8):
	#allocate % of gpu memory.
	num_threads = os.environ.get('OMP_NUM_THREADS')
	gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=gpu_fraction)
	if num_threads:
		return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options, intra_op_parallelism_threads=num_threads))
	else:
		return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

def getmodel():
	model = Sequential()
	model.add(Dense(32,W_regularizer=regularizers.l2(l=0.01), input_shape=(seqlength, features)))
	model.add(Bidirectional(LSTM(32, return_sequences=True)))#, input_shape=(seqlength, features)) ) ### bidirectional ---><---
	model.add(Dropout(0.2))
	model.add(BatchNormalization())
	model.add(Dense(64, activation='relu',W_regularizer=regularizers.l2(l=0.01)))
	model.add(Dropout(0.2))
	model.add(BatchNormalization())
	model.add(Dense(dimout, activation='softmax'))
	adam = optimizers.adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
	model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy']) 
	print(model.summary())
	return(model)

##################################################################
##################################################################
qtdbpath=sys.argv[1] ## first argument = qtdb database from physionet. 
perct=0.81 #percentage training
percv=0.19 #percentage validation

exclude = set()
exclude.update(["sel35","sel36","sel37","sel50","sel102","sel104","sel221","sel232", "sel310"])# no P annotated:
##################################################################
# datfile=qtdbpath+"sel49.dat"  ## single ECG to test if loading works.  
# x,y=get_ecg_data(datfile)
# print(x.shape,y.shape)
# # for i in range(y.shape[0]): #Invert QT-label to actually represent QT. Does give overlapping labels
# # 	y[i][0] = 1 - y[i][0]
# plotecg(x,y,0,y.shape[0]) ## plot all
# x,y=remove_seq_gaps(x,y) ## remove 'annotation gaps'
# plotecg(x,y,0,y.shape[0]) ## plot all
# x,y=splitseq(x,750,150),splitseq(y,750,150) ## create equal sized numpy arrays of n size and overlap of o 
# exit()
##################################################################

# load data
datfiles=glob.glob(qtdbpath+"*.dat")
xxt,yyt=LoaddDatFiles(datfiles[ :round(len(datfiles)*perct) ]) # training data. 
xxt,yyt=unison_shuffled_copies(xxt,yyt) ### shuffle
xxv,yyv=LoaddDatFiles(datfiles[ -round(len(datfiles)*percv): ] ) ## validation data.
seqlength=xxt.shape[1]
features=xxt.shape[2]
dimout=yyt.shape[2]
print("xxv/validation shape: {}, Seqlength: {}, Features: {}".format(xxv.shape[0],seqlength,features))
# #plot validation ecgs 
# with PdfPages('ecgs_xxv.pdf') as pdf:
# 	for i in range( xxv.shape[0] ): 
# 		print (i)
# 		plotecg(xxv[i,:,:],yyv[i,:,:],0,yyv.shape[1])
# 		pdf.savefig()
# 		plt.close()

# call keras/tensorflow and build lstm model 
KTF.set_session(get_session())
with tf.device('/cpu:0'): #switch to /cpu:0 to use cpu 
	if not os.path.isfile('model.h5'):
		model = getmodel() # build model
		model.fit(xxt, yyt, batch_size=32, epochs=100, verbose=1) # train the model
		model.save('model.h5')

	model = load_model('model.h5')
	score, acc = model.evaluate(xxv, yyv, batch_size=4, verbose=1)
	print('Test score: {} , Test accuracy: {}'.format(score, acc))
	
	# predict
	yy_predicted = model.predict(xxv) 

	# maximize probabilities of prediction.
	for i in range(yyv.shape[0]): 
		b = np.zeros_like(yy_predicted[i,:,:])
		b[np.arange(len(yy_predicted[i,:,:])), yy_predicted[i,:,:].argmax(1)] = 1
		yy_predicted[i,:,:] = b

	# plot: 
	with PdfPages('ecg.pdf') as pdf:
		for i in range( xxv.shape[0] ): 
			print (i)
			plotecg_validation(xxv[i,:,:],yy_predicted[i,:,:],yyv[i,:,:],0,yy_predicted.shape[1])  # top = predicted, bottom=true
			pdf.savefig()
			plt.close()

	#plotecg(xv[1,:,:],yv[1,:,:],0,yv.shape[1]) ## plot first seq