In [1]:
#!/usr/bin/env python

SAVE_PARAMS_EVERY = 5000

import glob
import random
import numpy as np
import os.path as op
import pickle


def load_saved_params():
    """
    A helper function that loads previously saved parameters and resets
    the iteration counter to where training stopped.
    """
    st = 0
    for f in glob.glob("saved_params_*.npy"):
        iter_ = int(op.splitext(op.basename(f))[0].split("_")[2])
        if (iter_ > st):
            st = iter_

    if st > 0:
        print("Loading saved parameters from iteration %d" % st)
        with open("saved_params_%d.npy" % st, "rb") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None


def save_params(iter_, params):
    with open("saved_params_%d.npy" % iter_, "wb") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)
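
A quick sketch of how the two helpers above round-trip a checkpoint (the toy array and the iteration number 5000 are made up, and it assumes no other saved_params_*.npy files sit in the working directory):

toy_params = np.zeros((4, 3))
save_params(5000, toy_params)                # writes saved_params_5000.npy

start, params, state = load_saved_params()   # picks the most recent checkpoint
assert start == 5000
random.setstate(state)                       # restore the RNG to where it stopped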

SGD

The mini-batch sampling and the learning-rate decay are already written; all that is left is to compute the gradient and take a step scaled by the learning rate.
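
As a sketch of what that update looks like on its own (a toy quadratic objective with made-up values, not part of the assignment):

import numpy as np

f = lambda x: (np.sum(x ** 2), 2 * x)   # returns (cost, gradient)
x = np.array([1.0, -2.0])
step = 0.1
for _ in range(100):
    cost, grad = f(x)
    x = x - step * grad                 # the single update the note refers to
print(x)                                # ends up very close to the minimizer [0, 0]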

In [2]:
def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """ Stochastic Gradient Descent

    Implement the stochastic gradient descent method in this function.

    Arguments:
    f -- the function to optimize; it should take a single
         argument and return two outputs, a cost and the gradient
         with respect to that argument
    x0 -- the initial point to start SGD from
    step -- the step size for SGD
    iterations -- total number of iterations to run SGD for
    postprocessing -- postprocessing function for the parameters
                      if necessary. In the case of word2vec we will need to
                      normalize the word vectors to have unit length.
    PRINT_EVERY -- specifies how often (in iterations) to print the loss

    Return:
    x -- the parameter value after SGD finishes
    """

    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter_, oldx, state = load_saved_params()
        if start_iter_ > 0:
            x0 = oldx
            # integer division: the step has been halved once per completed
            # ANNEAL_EVERY block of iterations
            step *= 0.5 ** (start_iter_ // ANNEAL_EVERY)

        if state:
            random.setstate(state)
    else:
        start_iter_ = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    expcost = None

    for iter_ in range(start_iter_ + 1, iterations + 1):
        # Don't forget to apply the postprocessing after every iteration!
        # You might want to print the progress every few iterations.

        cost = None
        
        ### YOUR CODE HERE
        # The mini-batching and learning-rate decay are handled elsewhere;
        # here we only take one gradient step scaled by the learning rate.
        cost, grad = f(x)
        x -= step * grad
        x = postprocessing(x)
        ### END YOUR CODE

        if iter_ % PRINT_EVERY == 0:
            if not expcost:
                expcost = cost
            else:
                expcost = .95 * expcost + .05 * cost
                
            print("iter_ %d: %f" % (iter_, expcost))
        try:
            if iter_ % SAVE_PARAMS_EVERY == 0 and useSaved:
                save_params(iter_, x)
                print("saved!")

            if iter_ % ANNEAL_EVERY == 0:
                step *= 0.5
        except Exception as e:
            print(str(e))
    return x


def sanity_check():
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 1 result:", t1)
    assert abs(t1) <= 1e-6

    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
    print("test 2 result:", t2)
    assert abs(t2) <= 1e-6

    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 3 result:", t3)
    assert abs(t3) <= 1e-6

    print("")



if __name__ == "__main__":
    sanity_check()
    # your_sanity_checks()
Running sanity checks...
iter_ 100: 0.004578
iter_ 200: 0.004353
iter_ 300: 0.004136
iter_ 400: 0.003929
iter_ 500: 0.003733
iter_ 600: 0.003546
iter_ 700: 0.003369
iter_ 800: 0.003200
iter_ 900: 0.003040
iter_ 1000: 0.002888
test 1 result: 8.414836786079764e-10
iter_ 100: 0.000000
iter_ 200: 0.000000
iter_ 300: 0.000000
iter_ 400: 0.000000
iter_ 500: 0.000000
iter_ 600: 0.000000
iter_ 700: 0.000000
iter_ 800: 0.000000
iter_ 900: 0.000000
iter_ 1000: 0.000000
test 2 result: 0.0
iter_ 100: 0.041205
iter_ 200: 0.039181
iter_ 300: 0.037222
iter_ 400: 0.035361
iter_ 500: 0.033593
iter_ 600: 0.031913
iter_ 700: 0.030318
iter_ 800: 0.028802
iter_ 900: 0.027362
iter_ 1000: 0.025994
test 3 result: -2.524451035823933e-09
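
For reference, a sketch of calling sgd the way the word2vec part of this assignment does, with a postprocessing function that renormalizes every word vector to unit length (the toy objective, which pulls unit vectors toward fixed target directions, and all names in it are made up for illustration):

def normalize_rows(x):
    # Scale each row of x to unit L2 norm.
    return x / np.linalg.norm(x, axis=1, keepdims=True)

targets = normalize_rows(np.random.randn(5, 3))

def toy_cost_and_grad(x):
    # Negative sum of dot products with fixed unit-length targets;
    # minimizing this pulls each row of x toward its target direction.
    return -np.sum(x * targets), -targets

x_init = normalize_rows(np.random.randn(5, 3))
x_final = sgd(toy_cost_and_grad, x_init, 0.3, 1000,
              postprocessing=normalize_rows, PRINT_EVERY=200)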
