TensorFlow 中的非确定性行为和输出

问题描述

我在使用 TensorFlow (2.3) 编写一个自定义操作时遇到了问题。这段代码大多数时候可以正常工作,但即使输入完全相同,有的运行一切正常,有的运行却会抛出意外的错误和异常。

我尝试排查该问题,并且几乎可以确定这是一个求值依赖(evaluation dependency)问题。我尝试添加了一些控制依赖(control dependencies),但没有效果。代码有些冗长,对此我很抱歉,但我确实无法用更小的示例重现该问题。下面是我的代码:

import numpy as np
import tensorflow.compat.v1 as tf
# Run in TF1-style graph mode: ops are only executed through Session.run.
tf.compat.v1.disable_eager_execution()
# NOTE: the answer at the end of this post reports that removing this call
# makes the intermittent errors go away.
tf.disable_v2_behavior()


# Components of the ragged input, fed at Session.run time. They are bundled
# below as the tuple (values, l2_splits, l1_splits).
myTensor_values = tf.placeholder(dtype=tf.float32)  # flat values
myTensor_l2_splits = tf.placeholder(dtype=tf.int32)  # inner row splits
myTensor_l1_splits = tf.placeholder(dtype=tf.int32)  # outer row splits


def innerloop_processing(begin_index,end_index,input1) : 
    """Collect the inner rows of one outer row and stack them.

    Args:
        begin_index: position in ``input1[1]`` of the first inner row to read.
        end_index: position in ``input1[1]`` that bounds the rows to read.
        input1: sequence whose element 0 is the flat values tensor and whose
            element 1 is the inner row-splits tensor; other elements are not
            read here.

    Returns:
        The result of ``TensorArray.stack()`` over the collected rows. The
        stacked tensor is also printed as a side effect.
    """
    values, row_splits = input1[0], input1[1]
    ta = tf.TensorArray(tf.float32, size=0, dynamic_size=True,
                        clear_after_read=False, infer_shape=False)

    # cond and body must accept exactly the loop variables (counter, ta);
    # begin_index, end_index and the input tensors are captured by closure.
    def innerloop_body(counter, ta):
        row_begin = row_splits[counter]
        row_end = row_splits[counter + 1]
        row = tf.slice(values, [row_begin], [row_end - row_begin])
        # Rows are stored relative to begin_index so the array starts at 0.
        ta = ta.write(counter - begin_index, row)
        return counter + 1, ta

    def innerloop_cond(counter, ta):
        # Stop at the next pointer of the l2_splits.
        # NOTE(review): this compares split *values*, so a final inner row of
        # length <= 1 ends the loop early -- confirm whether
        # `counter < end_index` was intended.
        return row_splits[counter] < row_splits[end_index] - 1

    _, rows = tf.while_loop(innerloop_cond, innerloop_body, [begin_index, ta])
    stacked = rows.stack()
    print_results = tf.print("this is the component result  :", stacked)
    # Tie the print to the returned tensor so it runs whenever the result is
    # evaluated, whether or not this function is wrapped in tf.function.
    with tf.control_dependencies([print_results]):
        return tf.identity(stacked)


def generateL1Tensor_writeback(start_offest,step,num):
    """Build the row splits ``start_offest + k * step`` for ``k = 0 .. num``.

    Args:
        start_offest: int32 offset added to every split; a scalar or a
            one-element tensor.
        step: int32 distance between consecutive splits.
        num: int32 number of rows; ``num + 1`` split values are produced.

    Returns:
        The concatenation of all generated split values.
    """
    # A size (or dynamic_size) is mandatory when creating a TensorArray.
    values = tf.TensorArray(tf.int32, size=0, dynamic_size=True,
                            infer_shape=False)

    # cond and body take exactly the loop variables (values, counter);
    # start_offest, step and num are captured by closure.
    def cond(values, counter):
        # Equivalent to `counter*step <= num*step` for step > 0, but still
        # terminates when step == 0.
        return counter <= num

    def body(values, counter):
        # Reshape to [1] so concat() yields a flat vector whether
        # start_offest is a scalar or a one-element tensor.
        split = tf.reshape(counter * step + start_offest, [1])
        return values.write(counter, split), counter + 1

    final_values, _ = tf.while_loop(
        cond, body, [values, tf.constant(0, tf.int32)])
    return final_values.concat()

def multiply2n_ragged(tensor1,tensor2) : 
    """Multiply two rank-2 ragged tensors, one outer row at a time.

    For every outer row i, the inner rows of ``tensor1`` and ``tensor2`` are
    stacked into matrices A and B and ``A @ transpose(B)`` is computed. The
    outermost dimensions of the two inputs must be equal.

    Args:
        tensor1: tuple ``(values, inner_row_splits, outer_row_splits)``.
        tensor2: tuple with the same layout as ``tensor1``.

    Returns:
        A tuple ``(values, inner_row_splits, outer_row_splits)`` describing
        the ragged result.
    """
    def new_array(dtype):
        # size/dynamic_size are mandatory; clear_after_read=False because the
        # values array is concat()-ed on every iteration and again at the end.
        return tf.TensorArray(dtype, size=0, dynamic_size=True,
                              clear_after_read=False, infer_shape=False)

    outerloop_counter = tf.constant(0, dtype=tf.int32)
    carry_on = tf.constant(0, dtype=tf.int32)
    taValues = new_array(tf.float32)
    taL2Splits = new_array(tf.int32)
    taL1Splits = new_array(tf.int32).write(0, [0])  # outer splits start at 0
    innerloop_processing_graphed = tf.function(innerloop_processing)
    generateL1Tensor_writeback_graphed = tf.function(generateL1Tensor_writeback)

    # cond and body take exactly the five loop variables; the two input
    # tuples are captured by closure.
    def outerloop_cond(counter, taValues, taL2Splits, taL1Splits, carry_on):
        # The outer splits hold one more entry than there are outer rows.
        return counter < tf.shape(tensor1[2])[0] - 1

    def outloop_body(counter, taValues, taL2Splits, taL1Splits, carry_on):
        # Begin/end positions of the current outer row in each input.
        l1_comp_begin = tensor1[2][counter]
        l1_comp_end = tensor1[2][counter + 1]
        l1_comp2_begin = tensor2[2][counter]
        l1_comp2_end = tensor2[2][counter + 1]
        comp = innerloop_processing_graphed(l1_comp_begin, l1_comp_end, tensor1)
        comp2 = innerloop_processing_graphed(l1_comp2_begin, l1_comp2_end, tensor2)

        multiply = tf.matmul(comp, tf.transpose(comp2))  # the desired operation

        myshape = tf.shape(multiply)
        # TensorArray.concat() fails on an empty array, so only call it once
        # something has been written. Both branches return an int32 scalar.
        offset = tf.cond(taValues.size() > 0,
                         lambda: tf.shape(taValues.concat())[0],
                         lambda: tf.constant(0, dtype=tf.int32))
        # Inner row splits of this result block, shifted by the number of
        # values already written.
        l2v = generateL1Tensor_writeback_graphed(offset, myshape[1], myshape[0])
        taL2Splits = taL2Splits.write(counter, l2v)
        taValues = taValues.write(counter, tf.reshape(multiply, [-1]))
        carry_on = carry_on + myshape[0]  # running count of result rows
        taL1Splits = taL1Splits.write(counter + 1, [carry_on])
        return counter + 1, taValues, taL2Splits, taL1Splits, carry_on

    _, ta1, ta2, ta3, _ = tf.while_loop(
        outerloop_cond, outloop_body,
        [outerloop_counter, taValues, taL2Splits, taL1Splits, carry_on])
    # Consecutive blocks repeat their shared boundary split, so de-duplicate.
    uinquie_ta2, _ = tf.unique(ta2.concat())
    final_values = ta1.concat(), uinquie_ta2, ta3.concat()
    return final_values




# Ragged input as (flat values, inner row splits, outer row splits).
t = myTensor_values,myTensor_l2_splits,myTensor_l1_splits

oo   =multiply2n_ragged(t,t)
new_oo = multiply2n_ragged(oo,oo)


# Fifteen values forming five inner rows of three, grouped into outer rows of
# two and three inner rows. These match the rows printed in the output below;
# l2_splits ends at 15, so vals needs 15 entries.
vals = np.array([1.0, 2.2, 1.1, 4.0, 5.0, 1.1, 6.0, 7.0, 1.1,
                 8.0, 9.0, 1.1, 10.0, 11.0, 1.1], dtype=np.float32)
l2_splits = np.array([0, 3, 6, 9, 12, 15], dtype=np.int32)
l1_splits = np.array([0, 2, 5], dtype=np.int32)

# The context manager closes the session even if run() raises.
with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
    sess.run(tf.global_variables_initializer())
    # The keyword is `feed_dict` (lower case).
    re = sess.run([new_oo],
                  feed_dict={myTensor_values: vals,
                             myTensor_l1_splits: l1_splits,
                             myTensor_l2_splits: l2_splits})
print(re)

正如我所说,这段代码多次运行都正常,但对于相同的输入,有时会产生如下错误。下面是我遇到的几种不同错误的堆栈跟踪:

this is the component result  : [[1 2.2 1.1]
 [4 5 1.1]]
this is the component result  : [[1 2.2 1.1]
 [4 5 1.1]]
this is the component result  : [[6 7 1.1]
 [8 9 1.1]
 [10 11 1.1]]
this is the component result  : [[6 7 1.1]
 [8 9 1.1]
 [10 11 1.1]]
this is the component result  : [[7.05 16.21]
 [16.21 42.21]]
this is the component result  : [[7.05 16.21]
 [16.21 42.21]]
---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self,fn,*args)
   1364     try:
-> 1365       return fn(*args)
   1366     except errors.OpError as e:

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run_fn(Feed_dict,fetch_list,target_list,options,run_Metadata)
   1349       return self._call_tf_sessionrun(options,Feed_dict,-> 1350                                       target_list,run_Metadata)
   1351 

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _call_tf_sessionrun(self,run_Metadata)
   1442                                             fetch_list,-> 1443                                             run_Metadata)
   1444 

InvalidArgumentError: {{function_node __inference_innerloop_processing_13658}} {{function_node __inference_innerloop_processing_13658}} Expected size[0] in [0,0],but got 3
     [[{{node while/body/_1/while/Slice}}]]
     [[while_33/StatefulPartitionedCall_1]]

During handling of the above exception,another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-18-238a2ce9a03a> in <module>
     94 l2_splits = np.array([0,3,6,9,12,15])
     95 l1_splits = np.array([0,2,5  ])
---> 96 re       = sess.run([new_oo  ],feed_dict={myTensor_values:vals,myTensor_l1_splits:l1_splits,myTensor_l2_splits:l2_splits  } )
     97 print(re)

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in run(self,fetches,run_Metadata)
    956     try:
    957       result = self._run(None,options_ptr,--> 958                          run_Metadata_ptr)
    959       if run_Metadata:
    960         proto_data = tf_session.TF_GetBuffer(run_Metadata_ptr)

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _run(self,handle,run_Metadata)
   1179     if final_fetches or final_targets or (handle and Feed_dict_tensor):
   1180       results = self._do_run(handle,final_targets,final_fetches,-> 1181                              Feed_dict_tensor,run_Metadata)
   1182     else:
   1183       results = []

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_run(self,run_Metadata)
   1357     if handle is None:
   1358       return self._do_call(_run_fn,Feeds,targets,-> 1359                            run_Metadata)
   1360     else:
   1361       return self._do_call(_prun_fn,fetches)

C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self,*args)
   1382                     '\nsession_config.graph_options.rewrite_options.'
   1383                     'disable_Meta_optimizer = True')
-> 1384       raise type(e)(node_def,op,message)
   1385 
   1386   def _extend_graph(self):

InvalidArgumentError:   Expected size[0] in [0,0],but got 3
     [[{{node while/body/_1/while/Slice}}]]
     [[while_33/StatefulPartitionedCall_1]]

以及以下错误

CancelledError                            Traceback (most recent call last)
C:\ProgramData\Anaconda3\envs\AutoEncoder\lib\site-packages\tensorflow\python\client\session.py in _do_call(self,-> 1443                                             run_Metadata)
   1444 

CancelledError: {{function_node __inference_innerloop_processing_11240}} {{function_node __inference_innerloop_processing_11240}} [_Derived_]Loop execution was cancelled.
     [[{{node while/LoopCond/_20}}]]
     [[while_27/StatefulPartitionedCall_1]]

During handling of the above exception,another exception occurred:

CancelledError                            Traceback (most recent call last)
<ipython-input-15-238a2ce9a03a> in <module>
     94 l2_splits = np.array([0,message)
   1385 
   1386   def _extend_graph(self):

CancelledError:   [_Derived_]Loop execution was cancelled.
     [[{{node while/LoopCond/_20}}]]
     [[while_27/StatefulPartitionedCall_1]]

我认为所有错误都是在 innerloop_processing 中抛出的。我还在 TensorFlow 的 GitHub 仓库中提交了一个 issue。

解决方法

问题似乎出在 tf.cond 上,幸运的是 TensorFlow 2 重新实现了它。因此,删除以下调用:

tf.disable_v2_behavior()

即可解决问题。