# Repository URL to install this package: (no URL present in this copy)
# Version: 0.4.15.dev1+g34a2788
# Pipeline identity: the name of this pipeline and a one-line description.
pipeline_name: topic_modeling
doc: 'Example topic-modeling end-to-end workflow.'
# Processing graph. Every entry pairs a `step_id` with the `module_type` it
# runs; `input_links` wires a step input to another step's output, written
# as `<step_id>.<output_name>`.
steps:
  # Has no input_links: nothing upstream in this graph feeds it.
  - step_id: import_text_corpus
    module_type: import.local.file_bundle

  # Consumes the file bundle from import_text_corpus; its `table` output
  # feeds both column-picking steps below.
  - step_id: create_text_corpus
    module_type: create.table.from.file_bundle
    input_links:
      file_bundle: import_text_corpus.file_bundle

  # Two instances of the same module on the same table; which column each
  # one picks is not set here (see the `column_name` entries in
  # input_aliases).
  - step_id: extract_texts_column
    module_type: table.pick.column
    input_links:
      table: create_text_corpus.table

  - step_id: extract_filename_column
    module_type: table.pick.column
    input_links:
      table: create_text_corpus.table

  # Fed by the array picked in extract_filename_column.
  - step_id: create_date_array
    module_type: parse.date_array
    input_links:
      array: extract_filename_column.array

  # Fed by the array picked in extract_texts_column.
  - step_id: tokenize_content
    module_type: tokenize.texts_array
    input_links:
      texts_array: extract_texts_column.array

  # Has no input_links; its `stopwords_list` output goes to preprocess_corpus.
  - step_id: create_stopwords_list
    module_type: create.stopwords_list

  # Joins the two branches: tokens from tokenize_content plus the stopword
  # list from create_stopwords_list.
  - step_id: preprocess_corpus
    module_type: preprocess.tokens_array
    input_links:
      tokens_array: tokenize_content.tokens_array
      remove_stopwords: create_stopwords_list.stopwords_list

  # Last step of the text branch: runs on the preprocessed tokens.
  - step_id: generate_lda
    module_type: generate.LDA.for.tokens_array
    input_links:
      tokens_array: preprocess_corpus.tokens_array
# Names for step inputs, written as `<step_id>.<input_name>: <alias>`.
# Entries are grouped by step, in the order the steps are declared.
input_aliases:
  # import_text_corpus
  import_text_corpus.path: text_corpus_folder_path

  # extract_texts_column / extract_filename_column
  extract_texts_column.column_name: content_column_name
  extract_filename_column.column_name: filename_column_name

  # create_date_array
  create_date_array.min_index: date_parse_min
  create_date_array.max_index: date_parse_max
  create_date_array.force_non_null: date_force_non_null
  # NOTE(review): the trailing "l" in `date_remove_tokensl` looks like a typo
  # for `date_remove_tokens`. Kept unchanged because renaming it would change
  # the alias name that users of this pipeline presumably rely on -- confirm
  # before fixing.
  create_date_array.remove_tokens: date_remove_tokensl

  # tokenize_content
  tokenize_content.tokenize_by_word: tokenize_by_word

  # create_stopwords_list
  create_stopwords_list.languages: languages
  create_stopwords_list.stopwords: stopwords

  # preprocess_corpus
  preprocess_corpus.to_lowercase: to_lowercase
  preprocess_corpus.remove_alphanumeric: remove_alphanumeric
  preprocess_corpus.remove_non_alpha: remove_non_alpha
  preprocess_corpus.remove_all_numeric: remove_all_numeric
  preprocess_corpus.remove_short_tokens: remove_short_tokens
  # NOTE(review): this input is also wired to
  # `create_stopwords_list.stopwords_list` through input_links in `steps`;
  # confirm that aliasing an already-linked input is intended.
  preprocess_corpus.remove_stopwords: remove_stopwords

  # generate_lda
  generate_lda.num_topics_min: num_topics_min
  generate_lda.num_topics_max: num_topics_max
  generate_lda.compute_coherence: compute_coherence
  generate_lda.words_per_topic: words_per_topic
# Names for step outputs, written as `<step_id>.<output_name>: <alias>`.
# Entries follow the order in which the steps are declared.
output_aliases:
  import_text_corpus.file_bundle: text_corpus_file_bundle
  create_text_corpus.table: text_corpus_table
  extract_texts_column.array: content_array
  create_date_array.date_array: date_array
  tokenize_content.tokens_array: tokenized_corpus
  preprocess_corpus.tokens_array: preprocessed_corpus
  # Three generate_lda outputs are aliased.
  generate_lda.topic_models: topic_models
  generate_lda.coherence_map: coherence_map
  generate_lda.coherence_table: coherence_table
# Default values, keyed by the alias names declared in input_aliases.
defaults:
  # `${pipeline_dir}` is left as a literal placeholder in the string;
  # presumably the consuming tool expands it -- TODO confirm.
  text_corpus_folder_path: '${pipeline_dir}/../data/text_corpus/data'
  content_column_name: 'content'
  filename_column_name: 'file_name'
  # Defaults for create_date_array's min_index / max_index inputs.
  date_parse_min: 11
  date_parse_max: 21
  languages:
    - 'italian'
  # Explicit empty list (a bare `stopwords:` would be null instead).
  stopwords: []
  num_topics_max: 7