SWDesk
[Python] Sentence Similarity
inhae
2021. 6. 29. 10:28
뉴스 제목, 기사 내용 등 문장의 유사도 분석을 위한 파이썬 소스 코드
def CheckSimilarity01(filePathname="./TestData/NN_친환경 제조업.xlsx",
                      sheetName="Test01",
                      threshold=0.1):
    """Print and plot pairwise TF-IDF cosine similarity between titles.

    Loads a sheet through ``cBExcel.LoadData``, extracts the nouns of every
    entry in its ``'Title'`` column with KoNLPy's Okt tagger, vectorises the
    noun strings with TF-IDF and computes the full cosine-similarity matrix.
    The matrix is printed, every pair ``(i, j)`` with ``j > i`` is reported,
    and finally the matrix is shown as a seaborn heatmap.

    Args:
        filePathname: Path of the Excel workbook to load.
        sheetName: Name of the sheet inside the workbook.
        threshold: A pair's second title is printed when its similarity is
            strictly greater than this value.

    Returns:
        None. All results are printed or displayed.

    Note:
        ``cBExcel`` is defined elsewhere in the project; this function
        assumes ``LoadData`` returns something indexable by ``'Title'`` that
        yields one title per iteration (presumably a pandas DataFrame --
        confirm against ``cBExcel``).
    """
    import konlpy
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    import matplotlib.pyplot as plt
    import seaborn as sns

    excel1 = cBExcel()
    data1 = excel1.LoadData(filePathname, sheetName)
    titlesDF1 = data1['Title']

    # Build the tagger once: it does not depend on the title being processed,
    # so constructing it inside the loop only repeats the same setup work.
    okt = konlpy.tag.Okt()

    titleList = []
    twitterList = []
    for title1 in titlesDF1:
        titleList.append(title1)
        # str() guards against non-string cells (e.g. numbers or NaN).
        twitter_nouns = ' '.join(okt.nouns(str(title1)))
        print(twitter_nouns)
        twitterList.append(twitter_nouns)

    tfidf_vectorizer = TfidfVectorizer(min_df=1)
    tfidf_matrix_twitter = tfidf_vectorizer.fit_transform(twitterList)
    similarity = cosine_similarity(tfidf_matrix_twitter, tfidf_matrix_twitter)
    print(similarity)

    # Only walk the part of each row to the right of the diagonal, so every
    # unordered pair of titles is visited exactly once.
    for row in range(len(titleList)):
        later_scores = similarity[row][row + 1:]
        for col, sim1 in enumerate(later_scores, start=row + 1):
            print("[sim1]", sim1)
            if sim1 > threshold:
                print(titleList[col])

    # Select a font family for the (Korean) tick labels before plotting.
    plt.rc('font', family='Gothic')
    sns.heatmap(similarity, xticklabels=titleList, yticklabels=titleList, cmap='viridis')
    plt.show()
<분석 결과>

반응형