Data Processing
1. After checking the data file, I found that the keyword and location columns contain NaN values. Therefore, I fill the NaN entries with an "unknown" placeholder.
data_train['keyword'] = data_train['keyword'].fillna('unknown_keyword')
data_train['location'] = data_train['location'].fillna('unknown_location')
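For reference, the missing-value check mentioned in step 1 is not shown; a one-line sketch of how it can be done with pandas:
# Sketch (not in the original): count NaN values per column.
print(data_train.isnull().sum())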
2. I use pandas to combine the keyword, location, and text columns of each row into a single string, and store these strings in a list.
corpus = data_train.apply(lambda row: f"keyword:{row['keyword']} | location:{row['location']} | text: {row['text']}", axis=1).tolist()
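Later snippets also use corpuss (the test-set corpus) and y (the training labels), which are not built above; a minimal sketch of that setup, assuming data_test is the test DataFrame and target is the label column (both assumptions on my part):
# Assumed setup (not in the original): apply the same preprocessing to the test set
# and extract the label vector from the training set.
data_test['keyword'] = data_test['keyword'].fillna('unknown_keyword')
data_test['location'] = data_test['location'].fillna('unknown_location')
corpuss = data_test.apply(lambda row: f"keyword:{row['keyword']} | location:{row['location']} | text: {row['text']}", axis=1).tolist()
y = data_train['target'].values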
3. I use BertTokenizer to tokenize and preprocess the lists; the tokenizer setup is sketched just below.
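The tokenizer instantiation does not appear in my snippets; a minimal sketch, assuming the bert-base-uncased vocabulary:
from transformers import BertTokenizer

# Assumed tokenizer setup (not in the original): load the pretrained BERT vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)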
# Find the longest tokenized sentence in the training corpus.
max_len = 0
for sent in corpus:
    # Tokenize the text and add the `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)

# Repeat for the test corpus (corpuss).
max_len = 0
for sent in corpuss:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))
print('Max sentence length: ', max_len)
import torch

input_ids = []
attention_masks = []
for sent in corpus:
    # Encode each training sentence: add special tokens, pad/truncate to 128 tokens,
    # and return input IDs and the attention mask as PyTorch tensors.
    X_bert = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(X_bert['input_ids'])
    attention_masks.append(X_bert['attention_mask'])
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y, dtype=torch.long)
print('Original: ', corpus[0])
print('Token IDs:', input_ids[0])
input_idss = []
attention_maskss = []
for sent in corpuss:
    # Encode each test sentence with the same settings as the training set.
    encoded_dicts = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_idss.append(encoded_dicts['input_ids'])
    attention_maskss.append(encoded_dicts['attention_mask'])
input_idss = torch.cat(input_idss, dim=0)
attention_maskss = torch.cat(attention_maskss, dim=0)
4. I use PyTorch's TensorDataset to wrap the tokenized corpus (train) and corpuss (test) into datasets that BERT can consume.
from torch.utils.data import TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)
datasets = TensorDataset(input_idss, attention_maskss)  # the test dataset doesn't include labels
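The training and prediction loops below use train_dataloader and test_dataloader, which are not defined in my snippets; a minimal sketch of how they can be built, assuming a batch size of 32 (an assumption on my part):
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Assumed DataLoader setup (not in the original): random order for training,
# sequential order for the test set so predictions line up with the test rows.
batch_size = 32
train_dataloader = DataLoader(dataset, sampler=RandomSampler(dataset), batch_size=batch_size)
test_dataloader = DataLoader(datasets, sampler=SequentialSampler(datasets), batch_size=batch_size)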
Model Training
I choose BERT with a classification head; the model loading is sketched just below, followed by the optimizer and scheduler settings.
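The model construction does not appear in my snippets; a minimal sketch, assuming bert-base-uncased with two output labels and AdamW imported from torch.optim (both assumptions on my part):
import torch
from torch.optim import AdamW
from transformers import BertForSequenceClassification

# Assumed model setup (not in the original): pretrained BERT with an untrained
# two-way classification layer on top, moved to the GPU when one is available.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
With the model in place, the optimizer and scheduler are set as follows: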
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8
                  )
from transformers import get_linear_schedule_with_warmup
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
Then, I conduct training in small batches.
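The loop below relies on a format_time helper and a few standard imports that are not shown in my snippets; a minimal sketch (the hh:mm:ss formatting is an assumption):
import datetime
import random
import time

import numpy as np

# Assumed helper (not in the original): format elapsed seconds as hh:mm:ss.
def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))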
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store statistics such as training loss, accuracy, and training time.
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure the training time of a single epoch.
    t0 = time.time()
    # Reset the total training loss for this epoch.
    total_train_loss = 0
    # Put the model in training mode (this does not itself run any training).
    model.train()
    # Iterate over the training set in mini-batches.
    for step, batch in enumerate(train_dataloader):
        # Report progress every 40 batches.
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # print(b_input_ids.shape)   # should be (batch_size, max_length)
        # print(b_input_mask.shape)  # same as above
        # print(b_labels.shape)      # should be (batch_size,)
        model.zero_grad()  # Clear gradients before each backward pass, since PyTorch accumulates them.
        # Forward pass; passing labels makes the model also return the loss.
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        # Extract loss and logits from the model outputs.
        loss = outputs.loss
        logits = outputs.logits
        print("Batch loss:", loss.item())
        # Accumulate the loss.
        total_train_loss += loss.item()
        # Backward pass.
        loss.backward()
        # Clip gradients to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update the parameters.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Average training loss over the epoch.
    avg_train_loss = total_train_loss / len(train_dataloader)
    # Training time for this epoch.
    training_time = format_time(time.time() - t0)
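    # Sketch, not in the original: training_stats is initialized above but never filled,
    # so record this epoch's numbers here (still inside the epoch loop).
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Time': training_time,
        }
    )
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))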
Model Prediction
I switch the model to evaluation mode with model.eval() and run the test batches through it to get the prediction results.
model.eval()
# Tracking variable for the predictions.
predictions = []
# Prediction loop.
for batch in test_dataloader:
    # Move the batch to the GPU (or CPU).
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch
    # No gradients are needed for inference.
    with torch.no_grad():
        # Forward pass to get the predictions.
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits
    # Move the results back to the CPU.
    batch_logits = logits.detach().cpu().numpy()
    # Store the predictions.
    predictions.append(batch_logits)
flat_predictions = np.concatenate(predictions, axis=0)
predicted_labels = np.argmax(flat_predictions, axis=1)
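The predicted labels can then be saved; a minimal sketch, assuming a Kaggle-style submission file with the test ids in data_test['id'] and a target column (both assumptions on my part):
import pandas as pd

# Sketch (not in the original): write the predictions to a two-column submission file.
submission = pd.DataFrame({'id': data_test['id'], 'target': predicted_labels})
submission.to_csv('submission.csv', index=False)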
Why?
This challenge is a classification problem, and BERT has several features that suit it:
1. Superior understanding of context from both directions;
2. Effectiveness of the pre-training/fine-tuning paradigm: all we need to do is add an untrained classification layer on top of the pretrained model and fine-tune it for our classification task;
3. Deep contextual analysis;
4. Versatility across multiple tasks.
Limitation
Long training time: I only trained for 2 epochs, but it took 10941.0 s (roughly 3 hours).
TO DO
1. Increase the number of training epochs;
2. Try other models or methods;
3. Add an error-analysis log.