티스토리 뷰

<Preparation>

  • ChromeDriver (a chromedriver executable matching the installed Chrome version)
  • "pip install selenium weasyprint"
import datetime
import os
import re
from random import *
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from weasyprint import HTML


class cNaverSearch:
    """Scrape Naver news search results with a Selenium-driven Chrome browser.

    Public entry points:

    * ``StartNaverNewsSearch(keyWord)`` - run a news search and visit every
      pagination link found on the result page.
    * ``SearchEachPage(linkAddress)`` - collect the ``.news_tit`` entries of
      one result page and fetch the body text of each linked article.
    * ``GetLinkPageInfo(linkAddress)`` - return title/body text of one page.
    * ``GetLinkPage(linkAddress)`` - like ``GetLinkPageInfo`` but also
      renders the page to a PDF file.

    Every browser started by this class is shut down with ``quit()`` in a
    ``finally`` block so that no Chrome/chromedriver process is left behind
    when a lookup fails.
    """

    # Path of the chromedriver executable handed to Selenium.
    EXECUTABLE_PATH = ".../ChromeDriver/chromedriver.exe"

    def __init__(self):
        """Create a searcher; no state is kept between calls."""
        pass

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _new_driver(self):
        """Start a new Chrome session using ``EXECUTABLE_PATH``."""
        # NOTE(review): Selenium >= 4.10 no longer accepts ``executable_path``
        # and expects a ``Service`` object instead - confirm the installed
        # Selenium version before upgrading.
        return webdriver.Chrome(executable_path=self.EXECUTABLE_PATH)

    def _open_page(self, linkAddress):
        """Start a browser and load *linkAddress*.

        Returns the live driver, or ``None`` when the browser could not be
        started or the page could not be loaded (the browser, if any, is shut
        down before returning ``None``).
        """
        driver = None
        try:
            driver = self._new_driver()
            driver.get(linkAddress)
        except Exception as err:
            # Best-effort contract: callers get a placeholder result instead
            # of an exception, but the failure is no longer silent.
            print("[Excepted]-OPEN", linkAddress, err)
            if driver is not None:
                driver.quit()
            return None
        return driver

    @staticmethod
    def _read_title_and_body(driver):
        """Return ``(title, body, valid)`` for the page loaded in *driver*.

        ``title`` is the ``<title>`` innerHTML with newlines removed and
        ``body`` is the text of ``<body>``.  Either one falls back to the
        string ``"NONE"`` when it cannot be read, in which case ``valid`` is
        ``False``.
        """
        valid = True

        try:
            title_tag = driver.find_element(By.TAG_NAME, 'title')
            title = title_tag.get_attribute('innerHTML').replace("\n", "")
        except Exception:
            title = "NONE"
            print("[Excepted]-TITLE")
            valid = False

        try:
            body = driver.find_element(By.TAG_NAME, 'body').text
        except Exception:
            # Always bind a value so the result dict can be built.
            body = "NONE"
            print("[Excepted]-BODY")
            valid = False

        return title, body, valid

    @staticmethod
    def _pdf_path(title, stamp):
        """Build the PDF output path from a page title and a timestamp string.

        Titles longer than 10 characters contribute characters 2..9 only
        (the first two characters are skipped; the reason is not visible in
        this file - kept as-is).  Characters that are not allowed in file
        names, or that would be read as path separators, become ``_``.
        """
        stem = title[2:10] if len(title) > 10 else title
        stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", stem)
        return "./PDFDocus/" + stem + "_" + stamp + ".pdf"

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def GetLinkPage(self, linkAddress):
        """Read title and body of *linkAddress* and save the page as a PDF.

        Args:
            linkAddress: URL of the page to load.

        Returns:
            dict with keys ``'title'``, ``'body'`` and ``'pdffilename'``.
            Values that could not be obtained are the string ``'NONE'``;
            ``'pdffilename'`` is ``'NONE'`` whenever no PDF was written.

        Raises:
            Whatever ``weasyprint`` or the file system raises while the PDF
            is being written; page-loading and element-lookup failures are
            reported through the ``'NONE'`` placeholders instead.
        """
        driver = self._open_page(linkAddress)
        if driver is None:
            return {
                'title': 'NONE',
                'body': 'NONE',
                'pdffilename': 'NONE',
            }

        try:
            title, body, valid = self._read_title_and_body(driver)
        finally:
            # The PDF is rendered by weasyprint from the URL, so the browser
            # is not needed past this point.
            driver.quit()
        print("[titleS]", title)

        pdf_path = 'NONE'
        if valid:
            stamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
            pdf_path = self._pdf_path(title, stamp)
            os.makedirs(os.path.dirname(pdf_path), exist_ok=True)
            HTML(linkAddress).write_pdf(pdf_path)
            print("PDF saved", linkAddress)

        return {
            'title': title,
            'body': body,
            'pdffilename': pdf_path,
        }

    def GetLinkPageInfo(self, linkAddress):
        """Read title and body text of *linkAddress* without saving a PDF.

        Args:
            linkAddress: URL of the page to load.

        Returns:
            dict with keys ``'title'``, ``'body'`` and ``'pdffilename'``.
            ``'title'``/``'body'`` are ``'NONE'`` when they could not be
            read; ``'pdffilename'`` is always ``'Invalid'`` because this
            method never writes a file.
        """
        print("[NAVER-2]", linkAddress)
        driver = self._open_page(linkAddress)
        if driver is None:
            return {
                'title': 'NONE',
                'body': 'NONE',
                'pdffilename': 'Invalid',
            }

        try:
            title, body, _valid = self._read_title_and_body(driver)
        finally:
            driver.quit()

        return {
            'title': title,
            'body': body,
            'pdffilename': 'Invalid',
        }

    def StartNaverNewsSearch(self, keyWord):
        """Search Naver news for *keyWord* and scrape every result page.

        The pagination links inside ``.sc_page_inner`` are collected first,
        the search browser is shut down, and then each link is handed to
        ``SearchEachPage``.

        Args:
            keyWord: search text; it is URL-encoded before being placed in
                the query string.

        Returns:
            list of the dicts produced by ``SearchEachPage`` for all pages,
            in page order.

        Raises:
            Selenium exceptions from starting the browser or locating the
            search box / pagination block are propagated to the caller.
        """
        driver = self._new_driver()
        try:
            driver.get(
                'https://search.naver.com/search.naver?where=news&sm=tab_jum&query='
                + quote_plus(keyWord)
            )
            search_box = driver.find_element(By.NAME, 'query')
            search_box.clear()
            search_box.send_keys(keyWord)
            search_box.send_keys(Keys.RETURN)
            pager = driver.find_element(By.CSS_SELECTOR, '.sc_page_inner')
            page_links = [
                anchor.get_attribute('href')
                for anchor in pager.find_elements(By.TAG_NAME, 'a')
            ]
        finally:
            driver.quit()

        returnMessages = []
        for page_link in page_links:
            if not page_link:
                # Anchors without an href cannot be visited.
                continue
            print("[NAVER-1", page_link)
            returnMessages.extend(self.SearchEachPage(page_link))
        return returnMessages

    def SearchEachPage(self, linkAddress):
        """Collect the news entries of one search-result page.

        For every element matching ``.news_tit`` the ``href`` and ``title``
        attributes are read; the linked article is then loaded through
        ``GetLinkPageInfo`` to obtain its body text.

        Args:
            linkAddress: URL of a search-result page.

        Returns:
            list of dicts with keys ``'address'``, ``'title'`` and
            ``'body'``, one per ``.news_tit`` element, in page order.

        Raises:
            Selenium exceptions from starting the browser or loading the
            result page are propagated to the caller.
        """
        driver = self._new_driver()
        try:
            driver.get(linkAddress)
            print(linkAddress)
            # Read the attributes while the page is still open, so the
            # browser can be shut down before the articles are fetched.
            entries = [
                (item.get_attribute('href'), item.get_attribute('title'))
                for item in driver.find_elements(By.CSS_SELECTOR, '.news_tit')
            ]
        finally:
            driver.quit()

        returnMessages = []
        for article_link, article_title in entries:
            print("[ADDRESS]", article_link)
            print("[TITLE]", article_title)
            pageInfo = self.GetLinkPageInfo(article_link)
            returnMessages.append({
                'address': article_link,
                'title': article_title,
                'body': pageInfo['body'],
            })
        return returnMessages

반응형