Data Processing
1. After checking the data file, I found that the keyword and location columns contain NaN values. Therefore, I fill the NaNs with "unknown" placeholder strings:
data_train['keyword'] = data_train['keyword'].fillna('unknown_keyword')
data_train['location'] = data_train['location'].fillna('unknown_location')
2. I use pandas to combine the keyword, location, and text columns of each row into a single string and store these strings in a list (the test file gets the same treatment; see the sketch below):
corpus = data_train.apply(
    lambda row: f"keyword:{row['keyword']} | location:{row['location']} | text: {row['text']}",
    axis=1
).tolist()
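The test-set list corpuss used below is built the same way; a minimal sketch (the dataframe name data_test is an assumption, since only the training side is shown in the original):
data_test['keyword'] = data_test['keyword'].fillna('unknown_keyword')
data_test['location'] = data_test['location'].fillna('unknown_location')
corpuss = data_test.apply(
    lambda row: f"keyword:{row['keyword']} | location:{row['location']} | text: {row['text']}",
    axis=1
).tolist()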
3. I use BertTokenizer to tokenize and preprocess the list.
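The tokenizer instance itself is not created in this excerpt; a minimal sketch, assuming the 'bert-base-uncased' checkpoint:
import torch  # used below for torch.cat and torch.tensor
from transformers import BertTokenizer

# Assumption: the original notebook uses the uncased base checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)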
max_len = 0
for sent in corpus:
    # Tokenize each training string and add the `[CLS]` and `[SEP]` special tokens
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)

# Same length check for the test-set list
max_len = 0
for sent in corpuss:
    input_ids = tokenizer.encode(sent, add_special_tokens=True)
    max_len = max(max_len, len(input_ids))

print('Max sentence length: ', max_len)
input_ids = []
attention_masks = []
for sent in corpus:
    # Encode each training string: add special tokens, pad/truncate to 128 tokens,
    # and return the attention mask as PyTorch tensors
    X_bert = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',       # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids.append(X_bert['input_ids'])
    attention_masks.append(X_bert['attention_mask'])

input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(y, dtype=torch.long)  # y: the training labels prepared earlier (not shown in this excerpt)

print('Original: ', corpus[0])
print('Token IDs:', input_ids[0])
input_idss = []
attention_maskss = []
for sent in corpuss:
    # Same encoding for the test-set strings
    encoded_dicts = tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',       # replaces the deprecated pad_to_max_length=True
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_idss.append(encoded_dicts['input_ids'])
    attention_maskss.append(encoded_dicts['attention_mask'])

input_idss = torch.cat(input_idss, dim=0)
attention_maskss = torch.cat(attention_maskss, dim=0)
4. I use a PyTorch TensorDataset to wrap the tokenized corpus and corpuss into datasets that can be fed to BERT.
from torch.utils.data import TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)
datasets = TensorDataset(input_idss, attention_maskss)  # the test dataset has no labels
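The train_dataloader used during training is not constructed in this excerpt; a minimal sketch, assuming a randomly shuffled DataLoader (the batch size of 32 is an assumption):
from torch.utils.data import DataLoader, RandomSampler

batch_size = 32  # assumption: the actual batch size is not shown
train_dataloader = DataLoader(
    dataset,                         # the labelled training TensorDataset from above
    sampler=RandomSampler(dataset),  # shuffle the training examples
    batch_size=batch_size,
)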
Model Training
I choose BERT and set up the optimizer and learning-rate scheduler as follows:
from torch.optim import AdamW  # the AdamW import is not shown in the excerpt; torch's implementation is an assumption

optimizer = AdamW(model.parameters(),
                  lr=2e-5,
                  eps=1e-8
                  )
from transformers import get_linear_schedule_with_warmup
epochs = 2
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = total_steps)
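The model passed to the optimizer above is loaded earlier in the notebook but not shown here; a minimal sketch, assuming BertForSequenceClassification with a two-label head on the same checkpoint as the tokenizer:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',   # assumption: same checkpoint as the tokenizer
    num_labels=2,          # binary classification
)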
Then, I train the model in mini-batches:
import time
import random
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)  # move the model to the GPU/CPU (not shown in the original excerpt)
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
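# Note: format_time() is used in the training loop below but is not defined in this
# excerpt; a minimal sketch of what it presumably does, based on how it is called:
import datetime

def format_time(elapsed):
    # Round to whole seconds and render as hh:mm:ss
    return str(datetime.timedelta(seconds=int(round(elapsed))))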
# Store training and evaluation statistics: loss, accuracy, elapsed time, etc.
training_stats = []
total_t0 = time.time()
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Time the training phase of this epoch
    t0 = time.time()
    # Reset the total training loss for this epoch
    total_train_loss = 0
    # Put the model into training mode (this only toggles layers such as dropout;
    # it does not run any training by itself)
    model.train()
    # Iterate over the training set in mini-batches
    for step, batch in enumerate(train_dataloader):
        # Report progress every 40 batches
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack the batch and move it to the device:
        # input ids and masks have shape (batch_size, max_length), labels (batch_size,)
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Clear the gradients before each backward pass; PyTorch accumulates them by default
        model.zero_grad()
    
        # Forward pass
        outputs = model(
            input_ids=b_input_ids,
            attention_mask=b_input_mask,
            labels=b_labels  # passing labels makes the model return the loss
        )

        # Extract the loss and logits from the model output
        loss = outputs.loss
        logits = outputs.logits

        print("Loss returned by the model:", loss.item())
    
        # Accumulate the loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Clip gradients to avoid exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the parameters
        optimizer.step()

        # Update the learning rate
        scheduler.step()

    # Average training loss over all batches in this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Elapsed training time for this epoch
    training_time = format_time(time.time() - t0)
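    # training_stats is initialised above but never filled in this excerpt; a minimal
    # sketch of recording the per-epoch numbers (the dictionary keys are an assumption):
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Training Time': training_time,
    })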
Model Prediction
I switch the model to evaluation mode with model.eval() and run it over the test set to obtain predictions.
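The test_dataloader is not constructed in this excerpt; a minimal sketch, assuming sequential batches over the unlabelled test TensorDataset (the batch size is an assumption):
from torch.utils.data import DataLoader, SequentialSampler

test_dataloader = DataLoader(
    datasets,                             # the test TensorDataset built earlier
    sampler=SequentialSampler(datasets),  # keep the original order of the test rows
    batch_size=32,                        # assumption
)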
model.eval()

# Tracking variable for the predicted logits
predictions = []

# Prediction loop over the test set
for batch in test_dataloader:
    # Move the batch to the device
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask = batch

    # No gradients are needed for inference
    with torch.no_grad():
        # Forward pass to get the predictions
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs.logits

    # Move the results back to the CPU
    batch_logits = logits.detach().cpu().numpy()

    # Store the predicted logits
    predictions.append(batch_logits)

flat_predictions = np.concatenate(predictions, axis=0)
predicted_labels = np.argmax(flat_predictions, axis=1)
Why?
This challenge is a binary text-classification problem, and BERT is a good fit for these reasons:
1. Superior understanding of context from both directions;
2. Effectiveness of the pre-training/fine-tuning paradigm: all we need to do is add an untrained classification layer on top of the pretrained encoder and fine-tune the resulting model for our classification task (see the sketch after this list);
3. Deep contextual analysis;
4. Versatility across many NLP tasks.
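To make point 2 concrete: BertForSequenceClassification essentially wraps the pretrained encoder and adds one new, randomly initialised linear layer on top of the pooled [CLS] representation. A minimal sketch of that idea (the checkpoint name and dropout rate are assumptions):
import torch.nn as nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, num_labels=2):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')  # pretrained encoder
        self.dropout = nn.Dropout(0.1)                              # assumption
        # The new, untrained layer that is fine-tuned for our task
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.pooler_output  # [CLS]-based sentence representation
        return self.classifier(self.dropout(pooled))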
Limitation
Long training time: I only trained for 2 epochs, yet this took 10941.0 s (roughly 3 hours).
TO DO
1. Increase the number of training epochs;
2. Try other models or methods;
3. Add an error-analysis blog post.