import numpy as np
    from matplotlib import pyplot as plt

    from keras.utils.np_utils import to_categorical
    from keras.models import Sequential
    from keras.layers.core import Dense, Dropout, Flatten
    from keras.layers import Conv2D
    from keras.callbacks import EarlyStopping
    from keras.datasets import mnist


    # Fetch the MNIST train/test splits (image arrays plus integer labels).
    (X_train, y_train), (X_test, y_test) = mnist.load_data()

    # All images share one height/width, so read them off the array shape and
    # record the single-channel layout used as the network's input shape.
    img_rows, img_cols = X_train.shape[1:3]
    input_shape = (img_rows, img_cols, 1)

    # Append a trailing channel axis, then convert to float32 and scale by
    # 1/255 (presumably mapping 0-255 intensities into [0, 1]).
    X_train = X_train[..., np.newaxis].astype('float32') / 255.
    X_test = X_test[..., np.newaxis].astype('float32') / 255.

    # One-hot encode the labels; the class count is the number of distinct
    # label values present in the training set.
    n_classes = len(np.unique(y_train))
    y_train = to_categorical(y_train, n_classes)
    y_test = to_categorical(y_test, n_classes)

    # CNN classifier: three stacked 3x3 ReLU convolutions with a growing number
    # of filters, dropout, a flattened dense hidden layer, dropout again, and a
    # softmax layer with one unit per class.
    model = Sequential([
        Conv2D(64, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        Conv2D(128, kernel_size=(3, 3), activation='relu'),
        Conv2D(256, kernel_size=(3, 3), activation='relu'),
        Dropout(0.5),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ])

    # Request accuracy under the short name 'acc'. Keras releases from 2.3.0 on
    # log a metric under the exact string passed to compile(), while older
    # releases log both spellings as 'acc'. With 'accuracy' the validation
    # metric becomes 'val_accuracy' on newer releases and the EarlyStopping
    # monitor below would never find 'val_acc', so training would always run
    # for the full n_epochs.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    # Stop once validation accuracy has not improved for 5 consecutive epochs.
    # Higher accuracy is better, so state the direction explicitly instead of
    # relying on name-based inference.
    callbacks = [EarlyStopping(monitor='val_acc', mode='max', patience=5)]

    batch_size = 128
    n_epochs = 200  # upper bound; early stopping normally ends training sooner

    # Hold out the last 20% of the training data for validation.
    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=n_epochs,
        verbose=1,
        validation_split=0.2,
        callbacks=callbacks,
    )

    # evaluate() returns [loss, accuracy] given the single metric compiled above.
    score = model.evaluate(X_test, y_test, verbose=0)
    print('Test loss:', score[0])
    print('Test accuracy:', score[1])

    # Model outputs for the whole test set: one row of per-class scores per image.
    preds = model.predict(X_test)

    # Show the first n_examples test images, each titled with its true class
    # (argmax of the one-hot label) and the predicted class (argmax of the
    # model output).
    n_examples = 10
    fig = plt.figure(figsize=(15, 15))
    for idx in range(n_examples):
        ax = fig.add_subplot(2, n_examples, idx + 1)
        ax.imshow(X_test[idx, :, :, 0], cmap='gray')
        true_class = np.argmax(y_test[idx])
        predicted_class = np.argmax(preds[idx])
        ax.set_title("Label: {}\nPredicted: {}".format(true_class, predicted_class))
        ax.axis('off')

    plt.show()

    # Second figure: the first n_examples test images the model got wrong.
    plt.figure(figsize=(15, 15))

    # Reduce one-hot labels and per-class model outputs to class indices once
    # for the whole test set instead of calling argmax per sample.
    true_classes = np.argmax(y_test, axis=1)
    pred_classes = np.argmax(preds, axis=1)

    # Indices of misclassified samples, capped at n_examples so the number of
    # panels follows the same setting as the subplot grid (previously the cap
    # was a hard-coded 10).
    misclassified = np.flatnonzero(true_classes != pred_classes)[:n_examples]

    for panel, sample in enumerate(misclassified, start=1):
        plt.subplot(2, n_examples, panel)
        plt.imshow(X_test[sample, :, :, 0], cmap='gray')
        plt.title("Label: {}\nPredicted: {}".format(true_classes[sample], pred_classes[sample]))
        plt.axis('off')
    plt.show()

Using TensorFlow backend.