Skip to content

Commit

Permalink
ibrahimsharaf#14 added doc2vec overridden abstract class methods
Browse files Browse the repository at this point in the history
  • Loading branch information
Ayatallah committed May 13, 2019
1 parent 1437686 commit 33730df
Showing 1 changed file with 39 additions and 0 deletions.
39 changes: 39 additions & 0 deletions models/doc2vec_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,42 @@ class doc2VecBuilder(ModelBuilder):

def __init__(self, corpus):
    """Store the training corpus and declare the model attribute.

    Args:
        corpus: The documents later handed to ``build_vocab`` and ``train``.
            ``train_model`` shuffles it in place with ``random.shuffle``,
            so it has to be a mutable sequence.
    """
    self.corpus = corpus
    # Declared here so the attribute always exists; it is filled in by
    # initialize_model() or load_model().
    self.model = None


def initialize_model(self):
    """Create an untrained Doc2Vec model and build its vocabulary.

    The new model is stored on ``self.model`` and its vocabulary is built
    from ``self.corpus``. Nothing is returned.
    """
    logging.info("Building Doc2Vec vocabulary")
    hyperparameters = {
        # Ignores all words with total frequency lower than this
        "min_count": 1,
        # The maximum distance between the current and predicted word
        # within a sentence
        "window": 10,
        # Dimensionality of the generated feature vectors
        "vector_size": 300,
        # Number of worker threads to train the model
        "workers": 5,
        # The initial learning rate
        "alpha": 0.025,
        # Learning rate will linearly drop to min_alpha as training progresses
        "min_alpha": 0.00025,
        # Training algorithm: dm=1 means 'distributed memory' (PV-DM),
        # dm=0 means 'distributed bag of words' (PV-DBOW)
        "dm": 1,
    }
    d2v_model = doc2vec.Doc2Vec(**hyperparameters)
    # Assign before building the vocabulary so self.model is set even if
    # build_vocab raises.
    self.model = d2v_model
    d2v_model.build_vocab(self.corpus)

def train_model(self, epochs=10):
    """Train ``self.model`` on ``self.corpus`` with a single ``train()`` call.

    Earlier revisions called ``train()`` ten times in a loop, each time with
    ``epochs=self.model.epochs``, and overwrote ``alpha``/``min_alpha``
    between calls. That ran ten times more passes than intended and broke
    the learning-rate schedule: the first call decayed the rate down to
    ``min_alpha``, after which it jumped back up and stayed flat. One call
    lets the model apply the alpha -> min_alpha decay it was configured
    with across the whole run.

    Args:
        epochs: Number of passes over the corpus. Defaults to 10; raise it
            (e.g. to 20) if more time/computational power is available.

    Raises:
        ValueError: If ``epochs`` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1, got {0}".format(epochs))

    logging.info("Training Doc2Vec model")
    # Shuffle once, in place, before training; re-shuffling between passes
    # is not possible (or needed) with a single train() call.
    random.shuffle(self.corpus)
    logging.info('Training for {0} epochs'.format(epochs))
    # alpha and min_alpha are deliberately left untouched so the decay set
    # up when the model was created is the one that gets used.
    self.model.train(self.corpus,
                     total_examples=self.model.corpus_count,
                     epochs=epochs)

def save_model(self):
    """Write ``self.model`` to ``./classifiers/d2v.model``.

    The path is relative, so it resolves against the current working
    directory of the process. Nothing is returned.
    """
    logging.info("Saving trained Doc2Vec model")
    destination = "./classifiers/d2v.model"
    trained = self.model
    trained.save(destination)

def load_model(self):
    """Load the saved Doc2Vec model from ``./classifiers/d2v.model``.

    The loaded model replaces ``self.model`` and is also returned.

    Returns:
        The object returned by ``Doc2Vec.load``.
    """
    logging.info("Loading trained Doc2Vec model")
    # Reach the class through the ``doc2vec`` module, the same way
    # initialize_model() does, so both methods depend on the same import.
    self.model = doc2vec.Doc2Vec.load("./classifiers/d2v.model")
    return self.model


0 comments on commit 33730df

Please sign in to comment.