본문 바로가기

텍스트 마이닝

토픽 모델링으로 주제 찾기 (3)

반응형

7.3 Gensim을 이용한 토픽 모델링

7.3.1 Gensim 사용법과 시각화

  • Gensim: 토픽 모델링을 비롯해 의미적인 자연어 처리를 위한 다양한 라이브러리
# Install/upgrade Gensim (notebook shell magic) and fetch NLTK's stop-word list.
!pip install --upgrade gensim
import nltk

# Downloads the 'stopwords' corpus into the NLTK data directory (one-time setup).
nltk.download('stopwords')

"""
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
True
"""
# Import the libraries needed for tokenization and stop-word removal.
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# NOTE(review): cachedStopWords appears unused below (english_stops is used
# instead) — kept in case later parts of the file reference it; verify.
cachedStopWords = stopwords.words("english")

# Define the tokenizer with a regular expression: word characters or
# apostrophes, 3+ characters long. Raw string (r"...") avoids the
# invalid-escape-sequence warning that "\w" triggers in newer Python.
RegTok = RegexpTokenizer(r"[\w']{3,}")
english_stops = set(stopwords.words('english'))  # English stop words as a set for O(1) lookup

def tokenizer(text):
    """Lower-case *text*, tokenize it, and drop English stop words.

    Returns a list of tokens. The regex behind ``RegTok`` only matches
    tokens of 3 or more characters, so the original ``len(word) > 2``
    guard was always true and has been removed as redundant.
    """
    tokens = RegTok.tokenize(text.lower())
    # Keep only tokens that are not English stop words.
    return [word for word in tokens if word not in english_stops]

# Tokenize every document in the training set.
# NOTE(review): newsgroups_train is defined in an earlier part of this series — verify.
texts = [tokenizer(news) for news in newsgroups_train.data]
from gensim.corpora.dictionary import Dictionary

# Build a Gensim dictionary (word <-> integer id mapping) from the tokenized documents.
dictionary = Dictionary(texts)
print('#Number of initial unique words in documents:', len(dictionary))

# Filter out words whose document frequency is too low (< 5 docs) or too high
# (> 50% of docs), then keep only the 2,000 most frequent remaining words.
dictionary.filter_extremes(keep_n=2000, no_below=5, no_above=0.5)
print('#Number of unique words after removing rare and common words:', len(dictionary))

# Convert each tokenized document into a bag-of-words count vector.
corpus = [dictionary.doc2bow(text) for text in texts]
print('#Number of unique tokens: %d' % len(dictionary))
print('#Number of documents: %d' % len(corpus))

"""
#Number of initial unique words in documents: 46466
#Number of unique words after removing rare and common words: 2000
#Number of unique tokens: 2000
#Number of documents: 3219
"""
from gensim.models import LdaModel

# Train an LDA model with 10 topics; passes = number of sweeps over the corpus.
# random_state fixes the seed so results are reproducible.
num_topics = 10
passes = 5
%time model = LdaModel(corpus=corpus, id2word=dictionary,\
                       passes=passes, num_topics=num_topics, \
                       random_state=7)
                       
"""
WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy
CPU times: user 22.3 s, sys: 111 ms, total: 22.4 s
Wall time: 29.7 s
"""
# Show the top 10 words (with weights) for each of the 10 learned topics.
model.print_topics(num_words=10)

"""
[(0,
  '0.023*"com" + 0.018*"keith" + 0.016*"caltech" + 0.013*"sgi" + 0.013*"nntp" + 0.013*"posting" + 0.013*"host" + 0.012*"would" + 0.012*"system" + 0.011*"livesey"'),
 (1,
  '0.020*"morality" + 0.018*"objective" + 0.015*"one" + 0.015*"say" + 0.014*"uiuc" + 0.012*"frank" + 0.012*"values" + 0.010*"faq" + 0.010*"article" + 0.008*"cso"'),
 (2,
  '0.026*"com" + 0.025*"access" + 0.025*"posting" + 0.023*"host" + 0.023*"nntp" + 0.017*"digex" + 0.015*"article" + 0.013*"cwru" + 0.013*"___" + 0.013*"net"'),
 (3,
  '0.021*"university" + 0.017*"posting" + 0.015*"host" + 0.015*"nntp" + 0.013*"article" + 0.010*"com" + 0.009*"know" + 0.009*"i\'m" + 0.009*"would" + 0.009*"thanks"'),
 (4,
  '0.032*"com" + 0.015*"would" + 0.011*"article" + 0.010*"one" + 0.010*"get" + 0.009*"people" + 0.009*"ibm" + 0.008*"government" + 0.007*"good" + 0.007*"i\'m"'),
 (5,
  '0.025*"key" + 0.017*"encryption" + 0.014*"clipper" + 0.014*"chip" + 0.009*"keys" + 0.009*"use" + 0.008*"security" + 0.008*"government" + 0.008*"public" + 0.007*"escrow"'),
 (6,
  '0.024*"scsi" + 0.024*"drive" + 0.013*"com" + 0.012*"ide" + 0.011*"controller" + 0.010*"bus" + 0.010*"card" + 0.010*"disk" + 0.009*"one" + 0.009*"drives"'),
 (7,
  '0.017*"graphics" + 0.012*"image" + 0.012*"ftp" + 0.011*"file" + 0.010*"files" + 0.009*"available" + 0.009*"data" + 0.009*"pub" + 0.008*"software" + 0.008*"use"'),
 (8,
  '0.014*"god" + 0.013*"people" + 0.012*"one" + 0.009*"would" + 0.007*"jesus" + 0.007*"com" + 0.007*"think" + 0.006*"many" + 0.006*"even" + 0.006*"say"'),
 (9,
  '0.033*"space" + 0.019*"nasa" + 0.009*"gov" + 0.007*"first" + 0.007*"launch" + 0.006*"moon" + 0.006*"earth" + 0.006*"orbit" + 0.006*"shuttle" + 0.006*"would"')]
"""
# Per-document topic distribution: list of (topic_id, probability) pairs for document 0.
print("#topic distribution of the first document: ", model.get_document_topics(corpus)[0])

"""
#topic distribution of the first document:  [(0, 0.72576934), (8, 0.2699505)]
"""
"""
# Install pyLDAvis and render an interactive topic-model visualization in the notebook.
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

# feed the LDA model into the pyLDAvis instance
lda_viz = gensimvis.prepare(model, corpus, dictionary)
lda_viz

 

 

 

 

 

 

※ 해당 내용은 <파이썬 텍스트 마이닝 완벽 가이드>의 내용을 토대로 학습하며 정리한 내용입니다.

반응형