This chapter introduces deep learning with Python and Theano.
The material is mainly organized from Bilibili:
1. Introduction to the deep learning framework Theano
2. Theano Python neural networks
Most of the time gradient descent only finds a local optimum.

Theano is the predecessor of TensorFlow.
ldflags = -lblas   (the BLAS link flag, set in the [blas] section of Theano's configuration; see the warning below)
Installation on Windows 10. One thing to note: installing Theano under Python 3.8 runs into some very strange problems, so Python 3.7 is used here.
conda create -n theano_env python=3.7
conda activate theano_env
conda install numpy scipy mkl-service libpython m2w64-toolchain
# To speed up the installation, you can use a domestic (Chinese) mirror
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple theano
After installation, the following warning appeared:
WARNING (theano.tensor.blas): Failed to import scipy.linalg.blas, and Theano flag blas.ldflags is empty. Falling back on slower implementations for dot(matrix, vector), dot(vector, matrix) and dot(vector, vector) (DLL load failed: 找不到指定的模块。)
(The Chinese part of the DLL error simply means "The specified module could not be found.") On reflection, I am only using this for learning, so the slower fallback is acceptable and the C/BLAS path is not needed.
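If you do want to get rid of the warning, Theano reads a THEANO_FLAGS environment variable (equivalent to editing .theanorc); a minimal sketch is shown below. The value -lblas is an assumption: it presumes a library literally named blas is on your linker path, and with conda's mkl-service you may not need it at all.

import os
# Assumed setup: point Theano at a BLAS before it is imported.
os.environ["THEANO_FLAGS"] = "blas.ldflags=-lblas,floatX=float32"
import theano
print(theano.config.floatX)  # prints float32 if the flags were picked up

Below is the first example: a small regression network with result visualization.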
# View more python tutorials on my Youtube and Youku channel!!!
# Youtube video tutorial: https://www.youtube.com/channel/UCdyjiB5H8Pu7aDTNVXTTpcg
# Youku video tutorial: http://i.youku.com/pythontutorial
# 10 - visualize result
"""
Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly.
"""
from __future__ import print_function
import theano
import theano.tensor as T
import numpy as np
import matplotlib.pyplot as plt
class Layer(object):
    def __init__(self, inputs, in_size, out_size, activation_function=None):
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        self.b = theano.shared(np.zeros((out_size, )) + 0.1)
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)
# Make up some fake data
# reshape the 1-D array into a column vector of shape (300, 1)
x_data = np.linspace(-1, 1, 300)[:, np.newaxis]
noise = np.random.normal(0, 0.05, x_data.shape)
y_data = np.square(x_data) - 0.5 + noise # y = x^2 - 0.5
# show the fake data
plt.scatter(x_data, y_data)
plt.show()
# determine the inputs dtype
x = T.dmatrix("x")
y = T.dmatrix("y")
# add layers
l1 = Layer(x, 1, 10, T.nnet.relu)
l2 = Layer(l1.outputs, 10, 1, None)
# compute the cost
cost = T.mean(T.square(l2.outputs - y))  # mean squared error: mean((l2.outputs - y)^2)
# compute the gradients
gW1, gb1, gW2, gb2 = T.grad(cost, [l1.W, l1.b, l2.W, l2.b])
# apply gradient descent
learning_rate = 0.05
train = theano.function(
    inputs=[x, y],
    outputs=[cost],
    updates=[(l1.W, l1.W - learning_rate * gW1),
             (l1.b, l1.b - learning_rate * gb1),
             (l2.W, l2.W - learning_rate * gW2),
             (l2.b, l2.b - learning_rate * gb2)])
# prediction
predict = theano.function(inputs=[x], outputs=l2.outputs)
# plot the real data
fig = plt.figure()
ax = fig.add_subplot(1,1,1)
ax.scatter(x_data, y_data)
plt.ion()
plt.show()
for i in range(1000):
    # training
    err = train(x_data, y_data)
    if i % 50 == 0:
        # to visualize the result and improvement
        try:
            ax.lines.remove(lines[0])
        except Exception:
            pass
        prediction_value = predict(x_data)
        # plot the prediction
        lines = ax.plot(x_data, prediction_value, 'r-', lw=5)
        plt.pause(.5)
Here is a somewhat special usage: drawing samples from a binomial distribution (the basis of dropout).
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
x=T.matrix('x')
y =T.fscalars('y')
theano.config.floatX='float32'
def dropout_layer(x, y):
    srng = shared_randomstreams.RandomStreams(np.random.RandomState(0).randint(999))
    mask = srng.binomial(n=1, p=1-y, size=x.shape)
    return x*T.cast(mask, theano.config.floatX)
a=np.random.randn(2,3)
print(dropout_layer(a,0.6))
The code above throws an error.
We need to modify it a little:
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
x=T.matrix('x')
y =T.fscalars('y')
theano.config.floatX='float32'
# def dropout_layer(x, y):
#     srng = shared_randomstreams.RandomStreams(np.random.RandomState(0).randint(999))
#     mask = srng.binomial(n=1, p=1-y, size=x.shape)
#     return x*T.cast(mask, theano.config.floatX)
a = np.random.randn(2, 3)
# create a shared random-number stream object
srng = shared_randomstreams.RandomStreams(np.random.RandomState(0).randint(999))
# symbolic 0/1 mask with keep-probability 1-y
mask = srng.binomial(n=1, p=1-y, size=x.shape)
d = x*T.cast(mask, theano.config.floatX)
fun=theano.function([x,y],d)
print(fun(a,0.6))

The code above now runs correctly. When fun is defined, Theano resolves the dependencies one by one, much like a makefile: d contains mask, so it looks up mask; mask uses y, so it goes back and resolves y; once the whole chain is resolved, the function can be compiled and executed.
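The same dependency resolution can be seen on a tiny graph; this is only an illustrative sketch (the variable names a, b, s, out are made up):

import theano
import theano.tensor as T

a = T.dscalar('a')
b = T.dscalar('b')
s = a + b      # intermediate node, never handed to theano.function explicitly
out = s * 2    # the requested output depends on s, which depends on a and b

# theano.function walks the graph backwards from out (out -> s -> a, b)
# and compiles the whole chain, just like make resolving targets.
f = theano.function([a, b], out)
print(f(1.0, 2.0))  # 6.0

Next, another small Theano example, this time for the log-likelihood cost: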
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
x=T.matrix('x')
y =T.ivector('y')
theano.config.floatX='float32'
# negative log-likelihood cost: pick out the log-probability of the true class
cost=[-T.log(x)[T.arange(y.shape[0]),y]]
fun=theano.function([x,y],cost)
a=np.array([[0.8,0.1,0.1],[0.1,0.1,0.8]])
b=np.array([0,2])
result=fun(a,b)
print(result)
The output is:
[array([0.22314355, 0.22314355])]
Previously I did not have a deep understanding of the log-likelihood cost, and did not see why it is called that; writing this code finally made it click: for each sample it takes the probability the network assigns to the true class (the entry indexed by the label y), takes its logarithm, and uses the negative of that as the loss.
For example, take the line
cost=[-T.log(x)[T.arange(y.shape[0]),y]]
where x and y are fed the values
a=np.array([[0.8,0.1,0.1],[0.1,0.1,0.8]])
b=np.array([0,2])
If we evaluate -T.log(x) on its own, the result is a (2, 3) matrix:
[[-log(0.8),-log(0.1),-log(0.1)],
 [-log(0.1),-log(0.1),-log(0.8)]]
But what does the trailing index in -T.log(x)[T.arange(y.shape[0]),y] mean? It selects one entry per row, at positions (0, 0) and (1, 2), i.e. the entries belonging to the true labels, which here happen to be the largest probabilities: [-log(0.8), -log(0.8)].
So the final result is:
[array([0.22314355, 0.22314355])]
Minimizing this negative log of the correct-class probabilities is the same as maximizing the log-likelihood of the labels, which is why it is called the (maximum) log-likelihood cost.
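A plain NumPy version of the same indexing makes this concrete (a small sketch reusing the a and b above):

import numpy as np

a = np.array([[0.8, 0.1, 0.1],
              [0.1, 0.1, 0.8]])   # predicted class probabilities, one row per sample
b = np.array([0, 2])              # true class labels

# pick element (i, b[i]) from row i, then take the negative log
nll = -np.log(a)[np.arange(b.shape[0]), b]
print(nll)          # [0.22314355 0.22314355]
print(nll.mean())   # averaging this is exactly what SoftmaxLayer.cost does below

With these pieces in place, here is the full network3.py: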
"""network3.py
~~~~~~~~~~~~~~
A Theano-based program for training and running simple neural
networks.
Supports several layer types (fully connected, convolutional, max
pooling, softmax), and activation functions (sigmoid, tanh, and
rectified linear units, with more easily added).
When run on a CPU, this program is much faster than network.py and
network2.py. However, unlike network.py and network2.py it can also
be run on a GPU, which makes it faster still.
Because the code is based on Theano, the code is different in many
ways from network.py and network2.py. However, where possible I have
tried to maintain consistency with the earlier programs. In
particular, the API is similar to network2.py. Note that I have
focused on making the code simple, easily readable, and easily
modifiable. It is not optimized, and omits many desirable features.
This program incorporates ideas from the Theano documentation on
convolutional neural nets (notably,
http://deeplearning.net/tutorial/lenet.html ), from Misha Denil's
implementation of dropout (https://github.com/mdenil/dropout ), and
from Chris Olah (http://colah.github.io ).
Written for Theano 0.6 and 0.7, needs some changes for more recent
versions of Theano.
"""
#### Libraries
# Standard library
import pickle
import gzip
# Third-party libraries
import numpy as np
import theano
import theano.tensor as T
from theano.tensor.nnet import conv
from theano.tensor.nnet import softmax
from theano.tensor import shared_randomstreams
from theano.tensor.signal import pool
# Activation functions for neurons
def linear(z): return z
def ReLU(z): return T.maximum(0.0, z)
from theano.tensor.nnet import sigmoid
from theano.tensor import tanh
#### Constants
GPU = False
if GPU:
    print("Trying to run under a GPU. If this is not desired, then modify network3.py\nto set the GPU flag to False.")
    try: theano.config.device = 'gpu'
    except: pass # it's already set
    theano.config.floatX = 'float32'
else:
    print("Running with a CPU. If this is not desired, then modify network3.py to set\nthe GPU flag to True.")
#### Load the MNIST data
def load_data_shared(filename="../data/mnist.pkl.gz"):
    f = gzip.open(filename, 'rb')
    training_data, validation_data, test_data = pickle.load(f, encoding='bytes')
    f.close()
    def shared(data):
        """
        Place the data into shared variables. This allows Theano to copy
        the data to the GPU, if one is available.
        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")
    return [shared(training_data), shared(validation_data), shared(test_data)]
#### Main class used to construct and train networks
class Network(object):
    def __init__(self, layers, mini_batch_size):
        """
        Takes a list of `layers`, describing the network architecture, and
        a value for the `mini_batch_size` to be used during training
        by stochastic gradient descent.
        """
        self.layers = layers
        self.mini_batch_size = mini_batch_size
        self.params = [param for layer in self.layers for param in layer.params]
        self.x = T.matrix("x")
        self.y = T.ivector("y")
        init_layer = self.layers[0]
        init_layer.set_inpt(self.x, self.x, self.mini_batch_size)
        for j in range(1, len(self.layers)):
            prev_layer, layer = self.layers[j-1], self.layers[j]
            layer.set_inpt(prev_layer.output, prev_layer.output_dropout, self.mini_batch_size)
        self.output = self.layers[-1].output
        self.output_dropout = self.layers[-1].output_dropout
    def SGD(self, training_data, epochs, mini_batch_size, eta,
            validation_data, test_data, lmbda=0.0):
        """Train the network using mini-batch stochastic gradient descent."""
        training_x, training_y = training_data
        validation_x, validation_y = validation_data
        test_x, test_y = test_data
        # compute number of minibatches for training, validation and testing
        # (integer division, so the mini-batch indices stay integers under Python 3)
        num_training_batches = size(training_data)//mini_batch_size
        num_validation_batches = size(validation_data)//mini_batch_size
        num_test_batches = size(test_data)//mini_batch_size
        # define the (regularized) cost function, symbolic gradients, and updates
        l2_norm_squared = sum([(layer.w**2).sum() for layer in self.layers])
        cost = self.layers[-1].cost(self) + 0.5*lmbda*l2_norm_squared/num_training_batches
        grads = T.grad(cost, self.params)
        updates = [(param, param-eta*grad) for param, grad in zip(self.params, grads)]
        # define functions to train a mini-batch, and to compute the
        # accuracy in validation and test mini-batches.
        i = T.lscalar() # mini-batch index
        train_mb = theano.function(
            [i], cost, updates=updates,
            givens={
                self.x:
                training_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                training_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        validate_mb_accuracy = theano.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                validation_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                validation_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        test_mb_accuracy = theano.function(
            [i], self.layers[-1].accuracy(self.y),
            givens={
                self.x:
                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size],
                self.y:
                test_y[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        self.test_mb_predictions = theano.function(
            [i], self.layers[-1].y_out,
            givens={
                self.x:
                test_x[i*self.mini_batch_size: (i+1)*self.mini_batch_size]
            })
        # Do the actual training
        best_validation_accuracy = 0.0
        for epoch in range(epochs):
            for minibatch_index in np.arange(num_training_batches):
                iteration = num_training_batches*epoch+minibatch_index
                if iteration % 1000 == 0:
                    print("Training mini-batch number {0}".format(iteration))
                cost_ij = train_mb(minibatch_index)
                if (iteration+1) % num_training_batches == 0:
                    validation_accuracy = np.mean(
                        [validate_mb_accuracy(j) for j in np.arange(num_validation_batches)])
                    print("Epoch {0}: validation accuracy {1:.2%}".format(
                        epoch, validation_accuracy))
                    if validation_accuracy >= best_validation_accuracy:
                        print("This is the best validation accuracy to date.")
                        best_validation_accuracy = validation_accuracy
                        best_iteration = iteration
                        if test_data:
                            test_accuracy = np.mean(
                                [test_mb_accuracy(j) for j in np.arange(num_test_batches)])
                            print('The corresponding test accuracy is {0:.2%}'.format(
                                test_accuracy))
        print("Finished training network.")
        print("Best validation accuracy of {0:.2%} obtained at iteration {1}".format(
            best_validation_accuracy, best_iteration))
        print("Corresponding test accuracy of {0:.2%}".format(test_accuracy))
#### Define layer types
class ConvPoolLayer(object):
    """Used to create a combination of a convolutional and a max-pooling
    layer. A more sophisticated implementation would separate the
    two, but for our purposes we'll always use them together, and it
    simplifies the code, so it makes sense to combine them.
    """
    def __init__(self, filter_shape, image_shape, poolsize=(2, 2),
                 activation_fn=sigmoid):
        """`filter_shape` is a tuple of length 4, whose entries are the number
        of filters, the number of input feature maps, the filter height, and the
        filter width.
        `image_shape` is a tuple of length 4, whose entries are the
        mini-batch size, the number of input feature maps, the image
        height, and the image width.
        `poolsize` is a tuple of length 2, whose entries are the y and
        x pooling sizes.
        """
        self.filter_shape = filter_shape
        self.image_shape = image_shape
        self.poolsize = poolsize
        self.activation_fn = activation_fn
        # initialize weights and biases
        n_out = (filter_shape[0]*np.prod(filter_shape[2:])/np.prod(poolsize))
        self.w = theano.shared(
            np.asarray(
                np.random.normal(loc=0, scale=np.sqrt(1.0/n_out), size=filter_shape),
                dtype=theano.config.floatX),
            borrow=True)
        self.b = theano.shared(
            np.asarray(
                np.random.normal(loc=0, scale=1.0, size=(filter_shape[0],)),
                dtype=theano.config.floatX),
            borrow=True)
        self.params = [self.w, self.b]
    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape(self.image_shape)
        conv_out = conv.conv2d(
            input=self.inpt, filters=self.w, filter_shape=self.filter_shape,
            image_shape=self.image_shape)
        pooled_out = pool.pool_2d(
            input=conv_out, ds=self.poolsize, ignore_border=True)
        self.output = self.activation_fn(
            pooled_out + self.b.dimshuffle('x', 0, 'x', 'x'))
        self.output_dropout = self.output # no dropout in the convolutional layers
class FullyConnectedLayer(object):
    def __init__(self, n_in, n_out, activation_fn=sigmoid, p_dropout=0.0):
        self.n_in = n_in
        self.n_out = n_out
        self.activation_fn = activation_fn
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.asarray(
                np.random.normal(
                    loc=0.0, scale=np.sqrt(1.0/n_out), size=(n_in, n_out)),
                dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.asarray(np.random.normal(loc=0.0, scale=1.0, size=(n_out,)),
                       dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]
    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = self.activation_fn(
            (1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = self.activation_fn(
            T.dot(self.inpt_dropout, self.w) + self.b)
    def accuracy(self, y):
        "Return the accuracy for the mini-batch."
        return T.mean(T.eq(y, self.y_out))
class SoftmaxLayer(object):
    def __init__(self, n_in, n_out, p_dropout=0.0):
        self.n_in = n_in
        self.n_out = n_out
        self.p_dropout = p_dropout
        # Initialize weights and biases
        self.w = theano.shared(
            np.zeros((n_in, n_out), dtype=theano.config.floatX),
            name='w', borrow=True)
        self.b = theano.shared(
            np.zeros((n_out,), dtype=theano.config.floatX),
            name='b', borrow=True)
        self.params = [self.w, self.b]
    def set_inpt(self, inpt, inpt_dropout, mini_batch_size):
        self.inpt = inpt.reshape((mini_batch_size, self.n_in))
        self.output = softmax((1-self.p_dropout)*T.dot(self.inpt, self.w) + self.b)
        self.y_out = T.argmax(self.output, axis=1)
        self.inpt_dropout = dropout_layer(
            inpt_dropout.reshape((mini_batch_size, self.n_in)), self.p_dropout)
        self.output_dropout = softmax(T.dot(self.inpt_dropout, self.w) + self.b)
    def cost(self, net):
        "Return the log-likelihood cost."
        return -T.mean(T.log(self.output_dropout)[T.arange(net.y.shape[0]), net.y])
    def accuracy(self, y):
        "Return the accuracy for the mini-batch."
        return T.mean(T.eq(y, self.y_out))
#### Miscellanea
def size(data):
    "Return the size of the dataset `data`."
    return data[0].get_value(borrow=True).shape[0]
def dropout_layer(layer, p_dropout):
    srng = shared_randomstreams.RandomStreams(
        np.random.RandomState(0).randint(999999))
    mask = srng.binomial(n=1, p=1-p_dropout, size=layer.shape)
    return layer*T.cast(mask, theano.config.floatX)
if __name__ == "__main__":
    mini_batch_size = 10
    data_name = 'F:\\work\\deep_learning\\mnist.pkl.gz'
    data = load_data_shared(data_name)
    training_data = data[0]
    validation_data = data[1]
    test_data = data[2]
    net = Network([
        FullyConnectedLayer(n_in=784, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
    net.SGD(training_data, 30, mini_batch_size, 0.1, validation_data, test_data)
Now let's swap the main function above for a convolutional version:
if __name__ == "__main__":
    mini_batch_size = 10
    data_name = 'F:\\work\\deep_learning\\mnist.pkl.gz'
    data = load_data_shared(data_name)
    training_data = data[0]
    validation_data = data[1]
    test_data = data[2]
    net = Network([
        ConvPoolLayer(image_shape=(mini_batch_size, 1, 28, 28), filter_shape=(20, 1, 5, 5), poolsize=(2, 2)),
        ConvPoolLayer(image_shape=(mini_batch_size, 20, 12, 12), filter_shape=(40, 20, 5, 5), poolsize=(2, 2)),
        FullyConnectedLayer(n_in=40*4*4, n_out=100),
        SoftmaxLayer(n_in=100, n_out=10)], mini_batch_size)
    net.SGD(training_data, 30, mini_batch_size, 0.1, validation_data, test_data)
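The image_shape and n_in values come from a simple shape calculation; below is a small sanity-check sketch (not part of network3.py) for a 'valid' convolution followed by 2x2 max-pooling.

def conv_pool_out(size, filter_size, pool_size):
    # a 'valid' convolution shrinks each side, max-pooling then divides it
    return (size - filter_size + 1) // pool_size

s1 = conv_pool_out(28, 5, 2)   # 28 -> 24 -> 12
s2 = conv_pool_out(s1, 5, 2)   # 12 ->  8 ->  4
print(s1, s2, 40 * s2 * s2)    # 12 4 640, hence n_in=40*4*4 for the fully connected layer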