Given the input, each RNN cell applies the sigmoid function to obtain the hidden-state vector, and we then use the hidden state to predict the output at each time step:

$$e^{(t)} = x^{(t)} L$$

$$h^{(t)} = \sigma\left(h^{(t-1)} W_h + e^{(t)} W_x + b_1\right)$$

$$\hat{y}^{(t)} = \mathrm{softmax}\left(h^{(t)} U + b_2\right)$$
To train the model, we use a cross-entropy loss on each predicted token:

$$J = \sum_{t=1}^{T} \mathrm{CE}\left(y^{(t)}, \hat{y}^{(t)}\right) = -\sum_{t=1}^{T} \sum_{i} y_i^{(t)} \log \hat{y}_i^{(t)}$$
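As an illustrative toy computation (the numbers here are assumptions, not from the original): with $C = 3$ classes, a one-hot gold label $y^{(t)} = (0, 1, 0)$, and a predicted distribution $\hat{y}^{(t)} = (0.2, 0.7, 0.1)$, only the gold class contributes to the sum:

$$\mathrm{CE}\left(y^{(t)}, \hat{y}^{(t)}\right) = -\log 0.7 \approx 0.357$$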
(a) i. The RNN model: $W_h \in \mathbb{R}^{H \times H}$ has $H^2$ parameters, and $W_x \in \mathbb{R}^{D \times H}$ has $D \cdot H$ parameters.
The window-based model: $W \in \mathbb{R}^{(2w+1)D \times H}$ has $(2w+1) \cdot D \cdot H$ parameters. The RNN model therefore has $H^2 - 2wDH$ more parameters.
ii. Time complexity of predicting the labels of a sentence of length $T$: each step costs $O(D \cdot H + H^2)$ for the hidden-state update and $O(H \cdot C)$ for the output, so the total is $O\left((DH + H^2 + HC) \cdot T\right)$.
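To make these counts concrete, here is a worked example with illustrative sizes (assumed for this sketch, not taken from the assignment): $D = 50$, $H = 300$, $w = 1$:

$$H^2 + DH = 300^2 + 50 \cdot 300 = 105{,}000 \qquad (2w+1)DH = 3 \cdot 50 \cdot 300 = 45{,}000$$

so the RNN cell carries $H^2 - 2wDH = 60{,}000$ more parameters than the window-based layer.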
(b) It is hard to optimize for F1 directly:

- F1 is not differentiable.
- F1 must be computed from predictions over the entire corpus, which makes batching and parallelization very difficult.

(c) Implement the RNN cell in q2_rnn_cell.py and run it:
```python
import numpy as np
import tensorflow as tf


class RNNCell(tf.nn.rnn_cell.RNNCell):
    def __init__(self, input_size, state_size):
        self.input_size = input_size
        self._state_size = state_size

    @property
    def state_size(self):
        return self._state_size

    @property
    def output_size(self):
        return self._state_size

    def __call__(self, inputs, state, scope=None):
        """TODO: In the code below, implement an RNN cell using @inputs"""
        scope = scope or type(self).__name__
        with tf.variable_scope(scope):
            W_x = tf.get_variable('W_x', shape=(self.input_size, self._state_size),
                                  dtype=tf.float32,
                                  initializer=tf.contrib.layers.xavier_initializer())
            W_h = tf.get_variable('W_h', shape=(self._state_size, self._state_size),
                                  dtype=tf.float32,
                                  initializer=tf.contrib.layers.xavier_initializer())
            b = tf.get_variable('b', shape=(self._state_size,), dtype=tf.float32,
                                initializer=tf.contrib.layers.xavier_initializer())
            # h_t = sigmoid(h_{t-1} W_h + x_t W_x + b); the output equals the new state.
            new_state = tf.nn.sigmoid(tf.matmul(state, W_h) + tf.matmul(inputs, W_x) + b)
        output = new_state
        return output, new_state


def test_rnn_cell():
    with tf.Graph().as_default():
        with tf.variable_scope("test_rnn_cell"):
            x_placeholder = tf.placeholder(tf.float32, shape=(None, 3))
            h_placeholder = tf.placeholder(tf.float32, shape=(None, 2))

            # Pin the weights to known values so the output can be checked.
            with tf.variable_scope("rnn"):
                tf.get_variable("W_x", initializer=np.array(np.eye(3, 2), dtype=np.float32))
                tf.get_variable("W_h", initializer=np.array(np.eye(2, 2), dtype=np.float32))
                tf.get_variable("b", initializer=np.array(np.ones(2), dtype=np.float32))

            tf.get_variable_scope().reuse_variables()
            cell = RNNCell(3, 2)
            y_var, ht_var = cell(x_placeholder, h_placeholder, scope="rnn")

            init = tf.global_variables_initializer()
            with tf.Session() as session:
                session.run(init)
                x = np.array([
                    [0.4, 0.5, 0.6],
                    [0.3, -0.2, -0.1]], dtype=np.float32)
                h = np.array([
                    [0.2, 0.5],
                    [-0.3, -0.3]], dtype=np.float32)
                y = np.array([
                    [0.832, 0.881],
                    [0.731, 0.622]], dtype=np.float32)
                ht = y

                y_, ht_ = session.run([y_var, ht_var],
                                      feed_dict={x_placeholder: x, h_placeholder: h})
                print("y_ = " + str(y_))
                print("ht_ = " + str(ht_))

                assert np.allclose(y_, ht_), "output and state should be equal."
                assert np.allclose(ht, ht_, atol=1e-2), "new state vector does not seem to be correct."
```
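As a quick sanity check of the cell equation outside TensorFlow, the expected values in test_rnn_cell can be reproduced with plain NumPy (a minimal sketch using the same pinned weights as the test):

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

x = np.array([[0.4, 0.5, 0.6], [0.3, -0.2, -0.1]])
h = np.array([[0.2, 0.5], [-0.3, -0.3]])
W_x, W_h, b = np.eye(3, 2), np.eye(2), np.ones(2)

# The same update RNNCell.__call__ computes: h_t = sigmoid(h W_h + x W_x + b)
h_t = sigmoid(h.dot(W_h) + x.dot(W_x) + b)
print(np.round(h_t, 3))  # [[0.832 0.881] [0.731 0.622]] -- the test's expected y
```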
(d) Implementing the RNN requires us to unroll the computation over the whole sentence. Unfortunately, sentences can be of arbitrary length, so the RNN would be unrolled a different number of times for different sentences, which makes it impossible to batch the data. The most common way to solve this problem is to zero-pad the input: suppose the longest input sentence has length M; for an input of length T we then:

- add "0-vectors" to x and y to make them of length M;
- create a masking vector $(m^{(t)})_{t=1}^{M}$ which is 1 for all $t \le T$ and 0 for all $t > T$;
- change the loss function to $J = \sum_{t=1}^{M} m^{(t)} \, \mathrm{CE}\left(y^{(t)}, \hat{y}^{(t)}\right)$.

i. By masking the loss, we zero out the loss (and the gradients) caused by these extra 0-labels.
ii. Implement pad_sequences and run the validation:
```python
def pad_sequences(data, max_length):
    """TODO: In the code below, for every sentence, labels pair in @data"""
    ret = []

    # Use this zero vector when padding sequences.
    zero_vector = [0] * Config.n_features
    zero_label = 4

    for sentence, labels in data:
        len_sentence = len(sentence)
        add_length = max_length - len_sentence
        if add_length > 0:
            # Pad short sentences with zero vectors / zero labels,
            # and mark the padded positions False in the mask.
            filled_sentence = sentence + ([zero_vector] * add_length)
            filled_labels = labels + ([zero_label] * add_length)
            mask = [True] * len_sentence
            mask.extend([False] * add_length)
        else:
            # Truncate sentences that are too long.
            mask = [True] * max_length
            filled_sentence = sentence[:max_length]
            filled_labels = labels[:max_length]
        ret.append((filled_sentence, filled_labels, mask))
    return ret
```
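A hedged usage sketch (it assumes Config.n_features == 2, a value chosen only for illustration):

```python
data = [([[4, 1], [5, 2]], [0, 1])]    # one sentence of length 2, two feature ids per token
print(pad_sequences(data, max_length=4))
# [([[4, 1], [5, 2], [0, 0], [0, 0]],  # tokens padded with zero vectors
#   [0, 1, 4, 4],                      # labels padded with zero_label = 4
#   [True, True, False, False])]       # mask marking the real tokens
```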
(e) Implementing the RNN model:

- Implement the add_placeholders, add_embedding, and add_training_op functions.
- Implement the add_prediction_op operation, which unrolls the RNN loop self.max_length times; from the 2nd time step on, reuse the variables in the variable scope so the cell weights are shared across time steps.
- Implement add_loss_op to handle the mask vector returned in the previous part.

```python
class RNNModel(NERModel):
    def add_placeholders(self):
        """TODO: Add these placeholders to self as the instance variables"""
        self.input_placeholder = tf.placeholder(
            tf.int32, shape=(None, self.max_length, Config.n_features), name='input')
        self.labels_placeholder = tf.placeholder(
            tf.int32, shape=(None, self.max_length), name='labels')
        self.mask_placeholder = tf.placeholder(
            tf.bool, shape=(None, self.max_length), name='mask')
        self.dropout_placeholder = tf.placeholder(
            tf.float32, name='dropout')

    def create_feed_dict(self, inputs_batch, mask_batch, labels_batch=None, dropout=1):
        feed_dict = {
            self.input_placeholder: inputs_batch,
            self.mask_placeholder: mask_batch,
            self.dropout_placeholder: dropout
        }
        if labels_batch is not None:
            feed_dict[self.labels_placeholder] = labels_batch
        return feed_dict

    def add_embedding(self):
        """Adds an embedding layer that maps from input tokens (integers) to
        vectors and then concatenates those vectors"""
        embeddings = tf.nn.embedding_lookup(
            tf.Variable(self.pretrained_embeddings), self.input_placeholder)
        embeddings = tf.reshape(
            embeddings, [-1, self.max_length, Config.n_features * Config.embed_size])
        return embeddings

    def add_prediction_op(self):
        """Adds the unrolled RNN:
            h_0 = 0
            for t in 1 to T:
                o_t, h_t = cell(x_t, h_{t-1})
                o_drop_t = Dropout(o_t, dropout_rate)
                y_t = o_drop_t U + b_2
        """
        x = self.add_embedding()
        dropout_rate = self.dropout_placeholder
        preds = []

        if self.config.cell == "rnn":
            cell = RNNCell(Config.n_features * Config.embed_size, Config.hidden_size)
        else:
            raise ValueError("Unsupported cell type " + self.config.cell)

        with tf.variable_scope('Layer1'):
            U = tf.get_variable('U', (Config.hidden_size, Config.n_classes),
                                initializer=tf.contrib.layers.xavier_initializer())
            b2 = tf.get_variable('b2', (Config.n_classes,),
                                 initializer=tf.constant_initializer(0))

        input_shape = tf.shape(x)
        state = tf.zeros((input_shape[0], Config.hidden_size))

        with tf.variable_scope("RNN"):
            for time_step in range(self.max_length):
                if time_step > 0:
                    # Share the cell weights across all time steps.
                    tf.get_variable_scope().reuse_variables()
                o, state = cell(x[:, time_step, :], state, scope="RNN")
                o_drop = tf.nn.dropout(o, dropout_rate)
                output = tf.matmul(o_drop, U) + b2
                preds.append(output)

        preds = tf.stack(preds, axis=1)

        assert preds.get_shape().as_list() == [None, self.max_length, self.config.n_classes], \
            "predictions are not of the right shape. Expected {}, got {}".format(
                [None, self.max_length, self.config.n_classes], preds.get_shape().as_list())
        return preds

    def add_loss_op(self, preds):
        """Adds Ops for the loss function to the computational graph."""
        masked_logits = tf.boolean_mask(preds, self.mask_placeholder)
        masked_labels = tf.boolean_mask(self.labels_placeholder, self.mask_placeholder)
        loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=masked_logits, labels=masked_labels))
        return loss

    def add_training_op(self, loss):
        train_op = tf.train.AdamOptimizer(Config.lr).minimize(loss)
        return train_op

    def preprocess_sequence_data(self, examples):
        def featurize_windows(data, start, end, window_size=1):
            """Use the input sequences in @data to construct new windowed data points."""
            from util import window_iterator
            ret = []
            for sentence, labels in data:
                sentence_ = []
                for window in window_iterator(sentence, window_size, beg=start, end=end):
                    sentence_.append(sum(window, []))
                ret.append((sentence_, labels))
            return ret

        examples = featurize_windows(examples, self.helper.START, self.helper.END)
        return pad_sequences(examples, self.max_length)

    def consolidate_predictions(self, examples_raw, examples, preds):
        """Batch the predictions into groups of sequence length"""
        assert len(examples_raw) == len(examples)
        assert len(examples_raw) == len(preds)

        ret = []
        for i, (sentence, labels) in enumerate(examples_raw):
            _, _, mask = examples[i]
            # Keep only the predictions for the unpadded tokens.
            labels_ = [l for l, m in zip(preds[i], mask) if m]
            assert len(labels_) == len(labels)
            ret.append([sentence, labels, labels_])
        return ret

    def predict_on_batch(self, sess, inputs_batch, mask_batch):
        feed = self.create_feed_dict(inputs_batch=inputs_batch, mask_batch=mask_batch)
        predictions = sess.run(tf.argmax(self.pred, axis=2), feed_dict=feed)
        return predictions

    def train_on_batch(self, sess, inputs_batch, labels_batch, mask_batch):
        feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch,
                                     mask_batch=mask_batch, dropout=Config.dropout)
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        return loss

    def __init__(self, helper, config, pretrained_embedding, report=None):
        super(RNNModel, self).__init__(helper, config, report)
        self.max_length = min(Config.max_length, helper.max_length)
        Config.max_length = self.max_length
        self.pretrained_embeddings = pretrained_embedding
        self.input_placeholder = None
        self.labels_placeholder = None
        self.mask_placeholder = None
        self.dropout_placeholder = None
        self.build()
```

Test the implementation with python q2_rnn.py test2.
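To see how the masking in add_loss_op behaves, here is a minimal standalone sketch (the shapes are assumptions chosen for illustration: a batch of 2 sentences, max_length 3, 5 classes):

```python
import numpy as np
import tensorflow as tf

preds = tf.constant(np.random.randn(2, 3, 5).astype(np.float32))  # (batch, max_length, n_classes)
labels = tf.constant([[1, 2, 0], [3, 0, 0]], dtype=tf.int32)
mask = tf.constant([[True, True, False], [True, False, False]])

# boolean_mask flattens the (batch, time) axes and keeps only the True steps,
# so padded positions contribute nothing to the loss or its gradients.
masked_logits = tf.boolean_mask(preds, mask)   # shape (3, 5)
masked_labels = tf.boolean_mask(labels, mask)  # shape (3,)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=masked_logits, labels=masked_labels))

with tf.Session() as sess:
    print(sess.run([tf.shape(masked_logits), loss]))
```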
(f) Train the model with python q2_rnn.py train. Training takes about two hours on a CPU and 10-20 minutes on a GPU.
The model and its output are saved in results/rnn/<timestamp>/, where <timestamp> is the date and time the program was run. The file results.txt contains the formatted output of the model's predictions on the validation set, and the file log contains the printed output, i.e. the confusion matrices and F1 scores computed during training.
Finally, you can interact with the model using: python q2_rnn.py shell -m results/rnn/<timestamp>/
(g) Limitations of the RNN model:
The model cannot see the future when making predictions. In "New York State University", for example, the first "New" may be tagged "LOC" (as the beginning of "New York"). A bidirectional RNN (biRNN) solves this. The model also does not enforce that adjacent tokens receive consistent labels (illustrated by the same example); introducing pairwise agreement into the loss (i.e., using a CRF loss) would solve that problem.
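For reference, here is a minimal sketch of the biRNN fix, assuming TF 1.x's built-in BasicRNNCell (tanh) and bidirectional_dynamic_rnn in place of the custom sigmoid RNNCell above; the function name and arguments are illustrative, not part of the assignment:

```python
import tensorflow as tf

def birnn_prediction_op(x, seq_lengths, hidden_size, n_classes):
    """x: (batch, max_length, input_size) embeddings; seq_lengths: (batch,) true lengths."""
    cell_fw = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    cell_bw = tf.nn.rnn_cell.BasicRNNCell(hidden_size)
    # Runs one RNN left-to-right and one right-to-left; steps beyond
    # seq_lengths are ignored, so every position sees both past and future context.
    (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw, cell_bw, x, sequence_length=seq_lengths, dtype=tf.float32)
    outputs = tf.concat([out_fw, out_bw], axis=-1)  # (batch, max_length, 2H)
    U = tf.get_variable("U", (2 * hidden_size, n_classes))
    b2 = tf.get_variable("b2", (n_classes,), initializer=tf.zeros_initializer())
    return tf.tensordot(outputs, U, axes=[[2], [0]]) + b2  # per-step class scores
```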
