ValueError: Variable topic_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:
I get this error when pretrained_embeddings=None:

m = model(num_docs,
          vocab_size,
          num_topics=num_topics,
          #embedding_size=embed_size,
          restore=False,
          #logdir="/data/",
          pretrained_embeddings=None,
          freqs=freqs)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-11-2552ac324704> in <module>()
25 #logdir="/data/",
26 pretrained_embeddings=None,
---> 27 freqs=freqs)
28
29 m.train(pivot_ids,target_ids,doc_ids, len(pivot_ids), num_epochs, idx_to_word=idx_to_word, switch_loss_epoch=5)
~/Lda2vec-Tensorflow/lda2vec/Lda2vec.py in __init__(self, num_unique_documents, vocab_size, num_topics, freqs, save_graph_def, embedding_size, num_sampled, learning_rate, lmbda, alpha, power, batch_size, logdir, restore, fixed_words, factors_in, pretrained_embeddings)
76 power=self.power)
77 # Initialize the Topic-Document Mixture
---> 78 self.mixture = M.EmbedMixture(self.num_unique_documents, self.num_topics, self.embedding_size)
79
80
~/Lda2vec-Tensorflow/lda2vec/embedding_mixture.py in __init__(self, n_documents, n_topics, n_dim, temperature, W_in, factors_in, name)
27 self.topic_embedding = tf.get_variable('topic_embedding', shape=[n_topics, n_dim],
28 dtype=tf.float32,
---> 29 initializer=tf.orthogonal_initializer(gain=scalar)) if factors_in is None else factors_in
30
31
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(name, shape, dtype, initializer, regularizer, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
1485 constraint=constraint,
1486 synchronization=synchronization,
-> 1487 aggregation=aggregation)
1488
1489
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(self, var_store, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
1235 constraint=constraint,
1236 synchronization=synchronization,
-> 1237 aggregation=aggregation)
1238
1239 def _get_partitioned_variable(self,
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in get_variable(self, name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, custom_getter, constraint, synchronization, aggregation)
538 constraint=constraint,
539 synchronization=synchronization,
--> 540 aggregation=aggregation)
541
542 def _get_partitioned_variable(self,
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in _true_getter(name, shape, dtype, initializer, regularizer, reuse, trainable, collections, caching_device, partitioner, validate_shape, use_resource, constraint, synchronization, aggregation)
490 constraint=constraint,
491 synchronization=synchronization,
--> 492 aggregation=aggregation)
493
494 # Set trainable value based on synchronization value.
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/ops/variable_scope.py in _get_single_variable(self, name, shape, dtype, initializer, regularizer, partition_info, reuse, trainable, collections, caching_device, validate_shape, use_resource, constraint, synchronization, aggregation)
859 "reuse=tf.AUTO_REUSE in VarScope? "
860 "Originally defined at:\n\n%s" % (
--> 861 name, "".join(traceback.format_list(tb))))
862 found_var = self._vars[name]
863 if not shape.is_compatible_with(found_var.get_shape()):
ValueError: Variable topic_embedding already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:
File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/embedding_mixture.py", line 29, in __init__
initializer=tf.orthogonal_initializer(gain=scalar)) if factors_in is None else factors_in
File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 78, in __init__
self.mixture = M.EmbedMixture(self.num_unique_documents, self.num_topics, self.embedding_size)
File "<ipython-input-8-6f2c3ffe8774>", line 27, in <module>
freqs=freqs
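For reference, a minimal sketch (illustrative names and shapes, not the repo's actual code) of what this TF 1.x error means: tf.get_variable refuses to create a variable whose name already exists in the current scope, which is exactly what happens when the model-building cell is re-run against the same default graph. Re-entering the scope with reuse=tf.AUTO_REUSE, as the message suggests, returns the existing variable instead; in a notebook, clearing the graph before rebuilding is another way out:

import tensorflow as tf

# Illustrative: mirrors the 'topic_embedding' variable from the traceback.
def build_topic_embedding():
    return tf.get_variable('topic_embedding', shape=[20, 300],
                           dtype=tf.float32,
                           initializer=tf.orthogonal_initializer())

v1 = build_topic_embedding()    # first call creates the variable
# build_topic_embedding()       # second call raises this exact ValueError

# Re-enter the current scope with reuse enabled to get the variable back:
with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
    v2 = build_topic_embedding()    # returns the existing variable

# Or, when re-running a notebook cell, clear the graph first:
# tf.reset_default_graph()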
Does the twenty newsgroups example work when you set pretrained_embeddings=False right now? There weren't any errors for me yesterday.
I'm assuming you pulled since my commit last night?
Yes, I did a pull this morning. Here are my results from twenty_newsgroups:
import pandas as pd
from lda2vec.nlppipe import Preprocessor

# Data directory
data_dir = "data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data_twenty_newsgroups"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True

# Read in data file
df = pd.read_csv(data_dir+"/"+input_file, sep="\t")

# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30, nlp="en_core_web_lg")

# Run the preprocessing on your dataframe
P.preprocess()

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("glove.6B.300d.txt")
else:
    embedding_matrix = None

# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)

from lda2vec import utils, model

# Path to preprocessed data
data_path = "data/clean_data_twenty_newsgroups"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids,
 target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)

# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 500
# Epoch that we want to "switch on" LDA loss
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = True

# Initialize the model
m = model(num_docs, vocab_size, num_topics, embedding_size=embed_size, pretrained_embeddings=pretrained_embeddings, freqs=freqs, batch_size=batch_size, save_graph_def=save_graph)

# Train the model
m.train(pivot_ids, target_ids, doc_ids, len(pivot_ids), num_epochs, idx_to_word=idx_to_word, switch_loss_epoch=switch_loss_epoch)

# Visualize topics with pyldavis
utils.generate_ldavis_data(data_path, m, idx_to_word, freqs, vocab_size)
InvalidArgumentError                      Traceback (most recent call last)
~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1333     try:
-> 1334       return fn(*args)
   1335     except errors.OpError as e:

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run_fn(feed_dict, fetch_list, target_list, options, run_metadata)
   1318       return self._call_tf_sessionrun(
-> 1319           options, feed_dict, fetch_list, target_list, run_metadata)
   1320

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list, run_metadata)
   1406         self._session, options, feed_dict, fetch_list, target_list,
-> 1407         run_metadata)
   1408

InvalidArgumentError: indices[0] = 5451 is not in [0, 5451)
	 [[{{node nce_loss/negative_sampling/nce_loss/embedding_lookup}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]

During handling of the above exception, another exception occurred:

InvalidArgumentError                      Traceback (most recent call last)
~/Lda2vec-Tensorflow/lda2vec/Lda2vec.py in train(self, pivot_words, target_words, doc_ids, data_size, num_epochs, switch_loss_epoch, save_every, report_every, print_topics_every, idx_to_word)
    244
    245             # Run a step of the model
--> 246             summary, _, l, lw2v, llda, step = self.sesh.run(fetches, feed_dict=feed_dict)
    247
    248             # Prints log every "report_every" epoch

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in run(self, fetches, feed_dict, options, run_metadata)
    927     try:
    928       result = self._run(None, fetches, feed_dict, options_ptr,
--> 929                          run_metadata_ptr)
    930       if run_metadata:
    931         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _run(self, handle, fetches, feed_dict, options, run_metadata)
   1150     if final_fetches or final_targets or (handle and feed_dict_tensor):
   1151       results = self._do_run(handle, final_targets, final_fetches,
-> 1152                              feed_dict_tensor, options, run_metadata)
   1153     else:
   1154       results = []

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)
   1326     if handle is None:
   1327       return self._do_call(_run_fn, feeds, fetches, targets, options,
-> 1328                            run_metadata)
   1329     else:
   1330       return self._do_call(_prun_fn, handle, feeds, fetches)

~/anaconda/envs/ai/lib/python3.6/site-packages/tensorflow/python/client/session.py in _do_call(self, fn, *args)
   1346       pass
   1347     message = error_interpolation.interpolate(message, self._graph)
-> 1348     raise type(e)(node_def, op, message)
   1349
   1350   def _extend_graph(self):

InvalidArgumentError: indices[0] = 5451 is not in [0, 5451)
	 [[node nce_loss/negative_sampling/nce_loss/embedding_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py:46) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]
Caused by op 'nce_loss/negative_sampling/nce_loss/embedding_lookup', defined at:
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ubuntu/anaconda/envs/ai/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  ...
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 82, in __init__
    handles = self._build_graph()
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/Lda2vec.py", line 162, in _build_graph
    loss_word2vec = self.w_embed(context, y)
  File "/home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py", line 46, in __call__
    sampled_values=sampler))

InvalidArgumentError (see above for traceback): indices[0] = 5451 is not in [0, 5451)
	 [[node nce_loss/negative_sampling/nce_loss/embedding_lookup (defined at /home/ubuntu/Lda2vec-Tensorflow/lda2vec/word_embedding.py:46) = GatherV2[Taxis=DT_INT32, Tindices=DT_INT64, Tparams=DT_FLOAT, _class=["loc:@Optimizer/train/update_nce_weights/AssignSub"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](nce_weights/read, nce_loss/negative_sampling/nce_loss/concat, nce_loss/negative_sampling/nce_loss/embedding_lookup/axis)]]
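To make the error above concrete, here is a minimal sketch (sizes taken from the message, variable names illustrative) of the same failure. The NCE weight matrix has vocab_size = 5451 rows, so valid ids run 0..5450; a lookup with id 5451 is exactly one past the end, which usually means the vocab_size handed to the model is one smaller than the largest token id in the data:

import tensorflow as tf

vocab_size, embed_size = 5451, 300    # sizes from the error message
weights = tf.get_variable('nce_weights_demo', shape=[vocab_size, embed_size])
bad_ids = tf.constant([5451], dtype=tf.int64)    # valid ids are 0 .. 5450
lookup = tf.nn.embedding_lookup(weights, bad_ids)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(lookup)    # InvalidArgumentError: indices[0] = 5451 is not in [0, 5451)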
tensorflow.__version__
'1.12.0'
For my own sanity, re-commenting for you so I can format with markdown. Can't reformat email replies.
import pandas as pd
from lda2vec.nlppipe import Preprocessor
# Data directory
data_dir ="data"
# Where to save preprocessed data
clean_data_dir = "data/clean_data_twenty_newsgroups"
# Name of input file. Should be inside of data_dir
input_file = "20_newsgroups.txt"
# Should we load pretrained embeddings from file
load_embeds = True
# Read in data file
df = pd.read_csv(data_dir+"/"+input_file, sep="\t")
# Initialize a preprocessor
P = Preprocessor(df, "texts", max_features=30000, maxlen=10000, min_count=30, nlp="en_core_web_lg")
# Run the preprocessing on your dataframe
P.preprocess()
# Load embeddings from file if we choose to do so
if load_embeds:
# Load embedding matrix from file path - change path to where you saved them
embedding_matrix = P.load_glove("glove.6B.300d.txt")
else:
embedding_matrix = None
# Save data to data_dir
P.save_data(clean_data_dir, embedding_matrix=embedding_matrix)
from lda2vec import utils, model
# Path to preprocessed data
data_path = "data/clean_data_twenty_newsgroups"
# Whether or not to load saved embeddings file
load_embeds = True
# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids,
target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)
# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 200
# Batch size - Increase/decrease depending on memory usage
batch_size = 500
# Epoch that we want to "switch on" LDA loss
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = True
# Initialize the model
m = model(num_docs,
vocab_size,
num_topics,
embedding_size=embed_size,
pretrained_embeddings=pretrained_embeddings,
freqs=freqs,
batch_size=batch_size,
save_graph_def=save_graph)
# Train the model
m.train(pivot_ids,
target_ids,
doc_ids,
len(pivot_ids),
num_epochs,
idx_to_word=idx_to_word,
switch_loss_epoch=switch_loss_epoch)
# Visualize topics with pyldavis
utils.generate_ldavis_data(data_path, m, idx_to_word, freqs, vocab_size)
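Not part of the example, but a quick sanity check (my suggestion; assumes the loaded arrays are numpy) that would catch the out-of-range id above before training starts:

# Every token/doc id must be strictly below the sizes handed to the model.
assert pivot_ids.max() < vocab_size, (int(pivot_ids.max()), vocab_size)
assert target_ids.max() < vocab_size, (int(target_ids.max()), vocab_size)
assert doc_ids.max() < num_docs, (int(doc_ids.max()), num_docs)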
Sorry! I’ll paste directly into github.
No worries 😄
At work right now, so I can't really help. Just adding this to my todo list tonight. Will get back to you. Sorry that this stuff is always breaking 🙁. Sooo many improvements lately, but they came with many more bugs.
Also, just to be clear, this is all supposed to be run on TensorFlow 1.5.0.
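A quick check that the environment matches (downgrade with pip install tensorflow==1.5.0 if not):

import tensorflow as tf
assert tf.__version__ == '1.5.0', tf.__version__    # repo targets TF 1.5.0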
I was running into the same error just now. The TF version was the problem; with 1.5.0 it's working!