#### Train a Convolutional Neural Network for MNIST Classification

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.tensorboard import SummaryWriter

# --- Training configuration ---
batch_size = 64        # samples per mini-batch (used by both loaders below)
learning_rate = 0.01   # step size passed to the optimizer
num_epochs = 5         # number of full passes over the training split

# Run on CUDA when torch reports it as available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Preprocessing pipeline: convert each image to a tensor, then normalize its
# single channel with mean 0.5 and std 0.5
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ]
)

# MNIST is downloaded into ./dataSet when missing. The train=True split is used
# for training and the train=False split serves as the validation set.
train_dataset = datasets.MNIST(
    root='./dataSet',
    train=True,
    download=True,
    transform=transform,
)
val_dataset = datasets.MNIST(
    root='./dataSet',
    train=False,
    download=True,
    transform=transform,
)

# Only the training loader shuffles; validation keeps the dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False)


# Define a simple neural network
class SimpleCNN(nn.Module):
    """Small CNN: two conv -> ReLU -> 2x2 max-pool stages followed by two linear layers.

    The final layer emits 10 raw scores (one per digit class); no softmax is
    applied inside the model.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor: 1 -> 10 channels with a 5x5 kernel, then 10 -> 20 with a 3x3 kernel
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=3)
        # Classifier head: flattened 20*5*5 feature map -> 128 hidden units -> 10 classes
        self.fc1 = nn.Linear(in_features=20 * 5 * 5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Return class scores of shape [batch, 10] for input of shape [batch, 1, 28, 28]."""
        # Stage 1: conv + ReLU + pool -> [batch, 10, 12, 12]
        features = F.relu(self.conv1(x))
        features = F.max_pool2d(features, kernel_size=2)

        # Stage 2: conv + ReLU + pool -> [batch, 20, 5, 5]
        features = F.relu(self.conv2(features))
        features = F.max_pool2d(features, kernel_size=2)

        # Flatten every sample to a 500-element vector -> [batch, 500]
        flat = features.view(-1, 20 * 5 * 5)

        hidden = F.relu(self.fc1(flat))   # [batch, 128]
        scores = self.fc2(hidden)         # [batch, 10]
        return scores


# Instantiate model and move it to GPU (if available)
model = SimpleCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Create TensorBoard SummaryWriter for visualizing model training process
writer = SummaryWriter('./output/runs/mnist_experiment')

# Highest validation accuracy (in percent) seen so far; a checkpoint is written
# whenever an epoch beats it
best_val_accuracy = 0.0

# try/finally guarantees the writer is closed even when training raises or the
# cell is interrupted, so the event file is not left open.
try:
    for epoch in range(num_epochs):
        # ---- Training phase ----
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)  # Move batch to the model's device

            optimizer.zero_grad()             # Zero gradients
            output = model(data)              # Forward pass
            loss = criterion(output, target)  # Calculate loss
            loss.backward()                   # Backward pass
            optimizer.step()                  # Update parameters

            # Record training loss to TensorBoard every 100 batches;
            # the global step counts batches across epochs
            if batch_idx % 100 == 0:
                loss_value = loss.item()
                writer.add_scalar('Loss/train', loss_value, epoch * len(train_loader) + batch_idx)
                print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss_value:.6f}')

        # ---- Validation phase ----
        model.eval()
        val_loss = 0.0
        correct = 0
        with torch.no_grad():  # Do not calculate gradients
            for data, target in val_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                # The criterion (default reduction) yields the mean over the batch.
                # Weight it by the batch size so that a smaller final batch is not
                # over-represented in the epoch average.
                val_loss += criterion(output, target).item() * data.size(0)
                # Predicted class = index of the largest score
                correct += (output.argmax(dim=1) == target).sum().item()

        num_val_samples = len(val_loader.dataset)
        val_loss /= num_val_samples                        # Per-sample average validation loss
        val_accuracy = 100. * correct / num_val_samples    # Validation accuracy in percent
        print(f'Validation Loss: {val_loss:.4f}, Accuracy: {correct}/{num_val_samples} ({val_accuracy:.0f}%)')

        # Record validation loss and accuracy to TensorBoard (one point per epoch)
        writer.add_scalar('Loss/validation', val_loss, epoch)
        writer.add_scalar('Accuracy/validation', val_accuracy, epoch)

        # Save model with highest validation accuracy
        # NOTE(review): assumes ./output already exists (the SummaryWriter log dir lives under it) — confirm
        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            torch.save(model.state_dict(), './output/best_model.pth')
            print(f'Model saved with accuracy: {best_val_accuracy:.2f}%')
finally:
    # Close SummaryWriter
    writer.close()

print('Training complete. writer.close()')

#### Use TensorBoard to View Loss Curves

In [None]:
!tensorboard --logdir=/mnt/workspace/output/runs

#### Call the Trained Model to Experience Model Performance


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
import matplotlib.pyplot as plt

# Define the same model structure as during training
class SimpleCNN(nn.Module):
    """CNN with two conv/ReLU/max-pool stages and a two-layer classifier head.

    Layer names (conv1, conv2, fc1, fc2) and sizes define the state_dict keys
    and tensor shapes that a saved checkpoint must match.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=10, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=10, out_channels=20, kernel_size=3)
        self.fc1 = nn.Linear(in_features=20 * 5 * 5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Compute the 10 raw class scores for each input sample."""
        # Two feature-extraction stages: convolution, ReLU, then 2x2 max pooling
        pooled = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2)
        pooled = F.max_pool2d(F.relu(self.conv2(pooled)), kernel_size=2)

        # Flatten to rows of 20*5*5 values before the linear layers
        flat = pooled.view(-1, 20 * 5 * 5)

        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the network and restore the saved weights.
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run also loads on a CPU-only machine.
model = SimpleCNN().to(device)
model.load_state_dict(torch.load('output/best_model.pth', map_location=device))
model.eval()

# Image preprocessing: force a single channel and 28x28 size, then convert to a
# tensor and normalize with mean 0.5 / std 0.5
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.Resize((28, 28)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Load test data (download=False: the files are expected to be in ./dataSet already)
test_dataset = datasets.MNIST(root="./dataSet", train=False, transform=transform, download=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=20, shuffle=False)

# Get one batch of data (the first 20 samples, since shuffle=False)
images, labels = next(iter(test_loader))

# Model prediction
with torch.no_grad():
    outputs = model(images.to(device))
    predicted = outputs.argmax(dim=1).cpu()  # Predicted class per image, moved back to CPU

# Create canvas and subplots: a 2x10 grid, one cell per image in the batch
fig, axes = plt.subplots(2, 10, figsize=(12, 3.5))
axes = axes.flatten()  # Flatten 2D array to 1D

# Hide coordinate axes on every cell, including any cell left without an image
for ax in axes:
    ax.axis('off')

# Draw each image with its true label and the model's prediction.
# zip stops at the shorter sequence, so a batch smaller than the grid is safe.
for ax, image, label, guess in zip(axes, images, labels, predicted):
    # `images` was never moved off the CPU; squeeze() drops the channel dimension
    ax.imshow(image.squeeze(), cmap="gray")
    ax.set_title(f"label: {label.item()}\npredict: {guess.item()}", fontsize=9)

plt.tight_layout()
plt.show()

#### Copy Model Files to OSS for Persistent Storage


In [8]:
!cp  -r /mnt/workspace/output/  /mnt/data/

#### Download Model Service Web Interface Code and Test Code, Copy to OSS


In [None]:
# Download model prediction web service interface code web.py
!curl -f -o web.py "http://aliyun-document-review.oss-cn-beijing.aliyuncs.com/dsw_files/web.py"

# Download test code for requesting model service
!curl -f -o request_web.py "http://aliyun-document-review.oss-cn-beijing.aliyuncs.com/dsw_files/request_web.py"

# Copy web.py to OSS
!cp /mnt/workspace/web.py /mnt/data/web.py

#### Run web.py in DSW to Verify Code Can Start Normally
##### 1. Install bottle first: the image environment selected for this article does not include this third-party package, and without it you will get a "No module named 'bottle'" error.
##### To access the web service started in DSW directly from the public internet, see: https://help.aliyun.com/zh/pai/user-guide/custom-services-access-configurations

In [None]:
# Install bottle package
!pip install bottle

# Start web.py service
!python web.py