Text-Classification-Pytorch

Bert embeddings

Open · rajae-Bens opened this issue on Aug 24, 2020 · 0 comments

Hi

I am trying to use the models you implemented with BERT embeddings for the Arabic language, but I am getting very low accuracy. I am wondering whether I am doing something wrong, especially since I am new to deep learning. Here is my modification to the attention model:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SelfAttention(nn.Module):
    def __init__(self, batch_size, output_size, hidden_size, bert):
        super(SelfAttention, self).__init__()

        """
        Arguments
        ---------
        batch_size : Size of the batch, same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_size : Size of the hidden state of the LSTM
        bert : Pre-trained BERT model whose last hidden states replace the GloVe word-embedding look-up table
        """

        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.bert = bert

        # The embedding dimension fed to the BiLSTM is BERT's hidden size (e.g. 768)
        embedding_length = bert.config.hidden_size
        print(embedding_length)
        # The original GloVe embedding table is no longer needed:
        # self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)

        self.dropout = 0.8
        # Note: nn.LSTM applies dropout only between stacked layers, so with a
        # single layer this value has no effect and PyTorch emits a warning.
        self.bilstm = nn.LSTM(embedding_length, hidden_size, dropout=self.dropout, bidirectional=True)
        # We use d_a = 350, r = 30 and penalization_coeff = 1 as in the original self-attention ICLR paper
        self.W_s1 = nn.Linear(2 * hidden_size, 350)
        self.W_s2 = nn.Linear(350, 30)
        self.fc_layer = nn.Linear(30 * 2 * hidden_size, 2000)
        self.label = nn.Linear(2000, output_size)

    def attention_net(self, lstm_output):
        """
        Self-attention produces a matrix embedding of the input sentence in which every row is an
        encoding of the input sentence that attends to a specific part of it. We use r = 30 such
        embeddings, concatenate them, and connect the result to a fully connected layer of size
        2000, which feeds the output layer of size 2 returning logits for the two classes (pos & neg).

        Arguments
        ---------
        lstm_output : Tensor containing the hidden state for each time step of the LSTM.

        Returns
        -------
        Attention weight matrix for the 30 sentence embeddings, each attending to different
        parts of the input sentence.

        Tensor sizes : lstm_output.size() = (batch_size, num_seq, 2*hidden_size)
                       attn_weight_matrix.size() = (batch_size, 30, num_seq)
        """
        attn_weight_matrix = self.W_s2(torch.tanh(self.W_s1(lstm_output)))
        attn_weight_matrix = attn_weight_matrix.permute(0, 2, 1)
        attn_weight_matrix = F.softmax(attn_weight_matrix, dim=2)

        return attn_weight_matrix

    def forward(self, input_sentences, batch_size=None):
        """
        Parameters
        ----------
        input_sentences : tensor of token ids with shape (batch_size, num_seq)
        batch_size : default None. Used only for prediction on a single sentence after training (batch_size = 1)

        Returns
        -------
        Output of the linear layer containing logits for the pos & neg classes.
        """
        # BERT is used as a frozen feature extractor: its last hidden states are the word embeddings
        with torch.no_grad():
            input = self.bert(input_sentences)[0]

        # (batch_size, num_seq, embedding_length) -> (num_seq, batch_size, embedding_length)
        input = input.permute(1, 0, 2)
        if batch_size is None:
            batch_size = self.batch_size
        # 2 = num_directions of the bidirectional LSTM
        h_0 = torch.zeros(2, batch_size, self.hidden_size, device=input.device)
        c_0 = torch.zeros(2, batch_size, self.hidden_size, device=input.device)

        output, (h_n, c_n) = self.bilstm(input, (h_0, c_0))
        output = output.permute(1, 0, 2)
        # output.size() = (batch_size, num_seq, 2*hidden_size)
        # h_n.size() = (2, batch_size, hidden_size)
        # c_n.size() = (2, batch_size, hidden_size)
        attn_weight_matrix = self.attention_net(output)
        # attn_weight_matrix.size() = (batch_size, r, num_seq)
        hidden_matrix = torch.bmm(attn_weight_matrix, output)
        # hidden_matrix.size() = (batch_size, r, 2*hidden_size)
        # Flatten the r weighted sums and pass them through the fully connected layers
        fc_out = self.fc_layer(hidden_matrix.view(-1, hidden_matrix.size()[1] * hidden_matrix.size()[2]))
        logits = self.label(fc_out)
        # logits.size() = (batch_size, output_size)

        return logits
```
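
For reference, this is roughly how I instantiate the model and run a forward pass. It is only a sketch of my setup using the Hugging Face `transformers` library; the `bert-base-multilingual-cased` checkpoint and the hyperparameter values below are placeholders for the Arabic BERT and the settings I actually use:

```python
import torch
from transformers import BertModel, BertTokenizer

# Placeholder checkpoint; in practice I load an Arabic / multilingual BERT
bert = BertModel.from_pretrained("bert-base-multilingual-cased")
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Illustrative hyperparameters only
batch_size, output_size, hidden_size = 32, 2, 256
model = SelfAttention(batch_size, output_size, hidden_size, bert)

sentences = ["first example sentence", "second example sentence"]
encoded = tokenizer(sentences, padding=True, truncation=True,
                    max_length=128, return_tensors="pt")

# input_ids has shape (batch_size, num_seq); the model returns (batch_size, output_size) logits
logits = model(encoded["input_ids"], batch_size=len(sentences))
print(logits.shape)  # torch.Size([2, 2])
```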

Could you help, please?

Thanks
