In [2]:
%matplotlib inline
from pymongo import MongoClient

from collections import Counter
from random import randint
from tqdm import tqdm

import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from keras.layers import Input, Dense
from keras.models import Model
from keras.callbacks import ProgbarLogger, Callback
Using TensorFlow backend.
In [3]:
# Jupyter front-end tweak: raise the per-cell output limit so long printed
# outputs (e.g. full Mongo documents) are not truncated in the browser.
from notebook.services.config import ConfigManager
cm = ConfigManager().update('notebook', {'limit_output': 1000000})

Check the database connection and inspect a sample document

In [4]:
# Connect to a local MongoDB instance (default host/port) and select the
# 'friendzone' database; `mongo` points at its `users` collection.
db = MongoClient()['friendzone']
mongo = db.users
In [5]:
# Training subset of users; each document carries `groups` (subscribed
# public ids) and a precomputed `vector` (see the sample printed below).
mongo_train = db.users_train
In [6]:
# Peek at a single document to verify the schema before processing.
for sample_user in mongo_train.find():
    print(sample_user)
    break
{'_id': 134217728, 'id': 134217728, 'first_name': 'Виктор', 'last_name': 'Куликов', 'sex': 2, 'bdate': '26.12', 'photo_big': 'https://pp.userapi.com/c630018/v630018728/26f0c/4VLW7JcJsXY.jpg', 'photo_max_orig': 'https://pp.userapi.com/c630018/v630018728/26f0d/ivKTRQc612k.jpg', 'can_see_audio': 0, 'last_seen': {'time': 1491719737, 'platform': 1}, 'absence': 153874.02688503265, 'groups': [38643478, 70226354, 14305889, 54154078, 140876144, 129618859, 64050816, 66814271, 38894284, 12382740, 45595714, 43387535, 58168203, 117431540, 41183770, 79637473, 40518256, 47252548, 88288156, 38127348, 2661911, 35061290, 81456984, 70570762, 63482848, 30555648, 64541139, 40390872, 39236729, 30277672, 36635754, 47710541, 98647548, 35319314, 56629893, 80345103, 28256828, 45844365, 72378974, 57293684, 41885467, 34491673], 'vector': [0.6142432389207699, -0.37688747342349604, -0.5780761858418278, 0.10631783819833811, -0.07250700860890576, 0.14404192937707014, -0.04636053835940341, -0.29426259184466996, -0.28150924090355955, 0.21694451084961752, -0.14951886491660965, -0.1543035093754817]}
In [77]:
# Total number of training users. `Cursor.count()` is deprecated (and removed
# in PyMongo 4); `Collection.count_documents` is the supported replacement.
mongo_train.count_documents({})
Out[77]:
1732575

Calculate the vocabulary of publics

Let's choose a minimum number of subscribers a public must have to be included in our vocabulary.

In [8]:
# Count, for every public (group id), how many training users subscribe to it.
# `Counter.update` with an iterable increments each element's count — the
# idiomatic replacement for the explicit inner loop.
vocab = Counter()
for user in tqdm(mongo_train.find(), miniters=5000):
    vocab.update(user['groups'])
493836it [01:37, 5040.39it/s]
In [9]:
# Number of distinct publics observed across all training users.
len(vocab)
Out[9]:
1494451
In [10]:
# Overlay subscriber-count distributions for several top-N cutoffs to help
# pick a vocabulary size.
for top_n in [100, 500, 1000, 5000, 10000]:
    sns.distplot([count for _, count in vocab.most_common(top_n)])
/home/sheldonai/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
In [11]:
# Keep only the 500 most-subscribed publics as the model vocabulary;
# `vocab_list` holds their group ids in descending frequency order.
vocab_size = 500
vocab_list = [x[0] for x in vocab.most_common(vocab_size)]
In [12]:
# Calculate the distribution of how many vocab publics each user subscribes
# to. A set gives O(1) membership tests; the generator-sum replaces the
# manual counter variable.
vocab_set = set(vocab_list)
public_count = []
for user in tqdm(mongo_train.find(), miniters=5000):
    public_count.append(sum(1 for group in user['groups'] if group in vocab_set))
507001it [01:23, 6039.24it/s]
In [13]:
# Distribution of vocab-public counts per user — shows how much of the
# 500-public vocabulary a typical user's subscriptions cover.
sns.distplot(public_count)
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f163090ea58>
/home/sheldonai/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))
In [14]:
# Median number of vocab publics per user.
np.median(public_count)
Out[14]:
17.0
In [15]:
# Free the large intermediates; only `vocab_list`/`vocab_set` are needed below.
del vocab, public_count

Create the autoencoder neural network

In [16]:
# Bottleneck size — the dimensionality of the learned user embedding.
encoding_dim = 50
In [17]:
# Symmetric autoencoder over the 500-dim membership vector:
# 500 -> 250 -> 50 (code) -> 250 -> 500, sigmoid output for 0/1 targets.
input_vec = Input(shape=(vocab_size,))
encoder_hidden = Dense(250, activation='relu')(input_vec)
encoded = Dense(encoding_dim, activation='relu')(encoder_hidden)
decoder_hidden = Dense(250, activation='relu')(encoded)
decoded = Dense(vocab_size, activation='sigmoid')(decoder_hidden)
In [18]:
# Full autoencoder for training, plus an encoder view of the same graph that
# maps the membership vector directly to the 50-dim code.
autoencoder = Model(input_vec, decoded)
encoder = Model(input_vec, encoded)
In [19]:
# Standalone decoder built by reusing the autoencoder's last two Dense layers.
# NOTE(review): relies on layers[-2]/layers[-1] being exactly the decoding
# layers — fragile if the architecture above changes.
encoded_input = Input(shape=(encoding_dim,))
decoder_l1 = autoencoder.layers[-2](encoded_input)
decoder_l2 = autoencoder.layers[-1](decoder_l1)
decoder = Model(encoded_input, decoder_l2)
In [20]:
# Binary cross-entropy matches the multi-hot 0/1 membership targets.
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
In [95]:
database_batch = 32
# `Cursor.count()` is deprecated (removed in PyMongo 4); use count_documents.
users_count = mongo_train.count_documents({})

def input_generator():
    """Endlessly yield (X, X) batches for autoencoder training.

    Each batch samples `database_batch` random users via Mongo's `$sample`
    stage and converts every user's subscriptions into a binary vector over
    `vocab_list` (1 where the vocab public is among the user's groups).
    """
    while True:
        sampled_users = list(mongo_train.aggregate([{
            '$sample': {'size': database_batch}
        }]))
        full_vectors = []
        for user in sampled_users:
            # Vectorized membership test replaces the per-public Python loop.
            full_vectors.append(np.in1d(vocab_list, user['groups']).astype(int))
        full_vectors = np.array(full_vectors)
        # Input and reconstruction target are the same vector.
        yield full_vectors, full_vectors
In [22]:
# Take one random batch up front to serve as a fixed validation set.
vectors_validation = next(input_generator())[0]
In [88]:
class LossHistory(Callback):
    """Keras callback recording loss/accuracy histories after every epoch."""

    def on_train_begin(self, logs=None):
        # Reset histories at the start of each training run.
        # (Original used a mutable default `logs={}` — a Python anti-pattern;
        # Keras passes `logs` explicitly, so behavior is unchanged.)
        self.loss = []
        self.acc = []
        self.val_loss = []
        self.val_acc = []

    def on_epoch_end(self, epoch, logs=None):
        # Keras passes the epoch index as the first positional argument
        # (the original misnamed it `batch`).
        logs = logs or {}
        print('Epoch end:', logs.get('loss'), logs.get('val_loss'))
        self.loss.append(logs.get('loss'))
        self.acc.append(logs.get('acc'))
        self.val_loss.append(logs.get('val_loss'))
        self.val_acc.append(logs.get('val_acc'))
In [ ]:
# Train on random Mongo-sampled batches: 50 batches of 32 per "epoch",
# 1000 epochs. verbose=0 because LossHistory prints per-epoch losses itself;
# the validation target equals the validation input (autoencoder).
history = autoencoder.fit_generator(input_generator(),
                                    steps_per_epoch=50,
                                    epochs=1000,
                                    verbose=0,
                                    workers=1,
                                    callbacks=[LossHistory()],
                                    validation_data=(vectors_validation, vectors_validation))
In [63]:
# Per-epoch training losses from the History object returned by fit_generator.
history.history['loss']
Out[63]:
[0.074755788606125861,
 0.074453553999774158,
 0.072900072089396417,
 0.074311724456492811,
 0.076027901843190193,
 0.073506404238287359,
 0.075702646165154874,
 0.075165686546824872,
 0.072398370015434921,
 0.074507724493741989]
In [97]:
# Training-loss curve over epochs.
plt.plot(history.history['loss'])
Out[97]:
[<matplotlib.lines.Line2D at 0x7f161fd07160>]
/home/sheldonai/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py:1297: UserWarning: findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))