티스토리 뷰

<Preparation>

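  • Chrome driver (a chromedriver build matching the installed Chrome version)
  • "pip install selenium"
  • "pip install weasyprint" (needed for the PDF export step)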

 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import datetime
from random import *
from weasyprint import HTML


class cDaumSearch:
    """Collect Daum news search results with a Selenium-driven Chrome.

    Every public method starts its own Chrome session and always ends it,
    even when a lookup fails part-way through.

    NOTE(review): ``executable_path=`` and the ``find_element(s)_by_*``
    helpers are the Selenium 3 spelling used throughout this file; confirm
    the installed Selenium version still provides them.
    """

    # Path of the ChromeDriver binary handed to Selenium.
    EXECUTABLE_PATH = ".../ChromeDriver/chromedriver.exe"

    # Characters that must not end up in the PDF file name (path separators,
    # Windows-reserved punctuation, stray control whitespace) -> "_".
    _UNSAFE_FILENAME_CHARS = str.maketrans(
        {ch: "_" for ch in '\\/:*?"<>|\r\t'}
    )

    def __init__(self):
        """Create a searcher; no state is kept between calls."""
        pass

    def _open_driver(self):
        """Start and return a new Chrome session using EXECUTABLE_PATH."""
        return webdriver.Chrome(executable_path=self.EXECUTABLE_PATH)

    @staticmethod
    def _read_title(webDriver):
        """Return the <title> markup without newlines, or None on failure."""
        try:
            titleTag = webDriver.find_element_by_tag_name('title')
            return titleTag.get_attribute('innerHTML').replace("\n", "")
        except Exception:
            print("[Excepted]-TITLE")
            return None

    @staticmethod
    def _read_body(webDriver):
        """Return the visible text of <body>, or None on failure."""
        try:
            return webDriver.find_element_by_tag_name('body').text
        except Exception:
            print("[Excepted]-BODY")
            return None

    def _collect_results(self, webDriver, logTag):
        """Fetch title/body for every '.f_link_b' link on the current page.

        Args:
            webDriver: session currently showing a search-result page.
            logTag: prefix printed in front of each visited address.

        Returns:
            A list of dicts with the keys 'address', 'title' and 'body'.
        """
        returnMessages = []
        for item1 in webDriver.find_elements_by_css_selector('.f_link_b'):
            linkAddress1 = item1.get_attribute('href')
            print(logTag, linkAddress1)
            pageInfo = self.GetLinkPageInfo(linkAddress1)
            returnMessages.append({
                'address': linkAddress1,
                'title': pageInfo['title'],
                'body': pageInfo['body'],
            })
        return returnMessages

    def GetLinkPage(self, linkAddress):
        """Read a page's title and body and save the page as a PDF.

        The PDF is rendered by WeasyPrint from ``linkAddress`` (not from the
        Selenium session) and is only written when both the title and the
        body could be read.

        Args:
            linkAddress: URL of the page to open.

        Returns:
            ``{'title', 'body', 'pdffilename'}``. A part that could not be
            read is reported as ``'NONE'``; ``'pdffilename'`` is ``'NONE'``
            when no PDF was written. If the browser cannot be started or the
            page cannot be loaded, only ``{'title': 'NONE', 'body': 'NONE'}``
            is returned.

        Raises:
            Whatever WeasyPrint raises while writing the PDF.
        """
        webDriver = None
        try:
            try:
                webDriver = self._open_driver()
                webDriver.get(linkAddress)
            except Exception:
                return {
                    'title': 'NONE',
                    'body': 'NONE',
                }

            titleString = self._read_title(webDriver)
            if titleString is not None:
                print("[titleS]", titleString)
            bodyString = self._read_body(webDriver)

            filename1 = 'NONE'
            if titleString is not None and bodyString is not None:
                # Long titles contribute only characters 2..9 to the name
                # (slice kept exactly as the file has always done it).
                if len(titleString) > 10:
                    title1 = titleString[2:10]
                else:
                    title1 = titleString
                # A raw title may contain '/', ':', '?' ... which would
                # break the output path.
                title1 = title1.translate(self._UNSAFE_FILENAME_CHARS)
                datename1 = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
                filename1 = "./PDFDocus/" + title1 + "_" + datename1 + ".pdf"
                HTML(linkAddress).write_pdf(filename1)
                print("PDF saved", linkAddress)

            return {
                'title': 'NONE' if titleString is None else titleString,
                # Previously an unreadable body left this name unbound and
                # the method crashed while building the result.
                'body': 'NONE' if bodyString is None else bodyString,
                'pdffilename': filename1,
            }
        finally:
            # quit() ends the whole driver session; runs on every exit path.
            if webDriver is not None:
                webDriver.quit()

    def GetLinkPageInfo(self, linkAddress):
        """Read a page's title and body text without saving a PDF.

        Args:
            linkAddress: URL of the page to open.

        Returns:
            ``{'title', 'body', 'pdffilename'}`` where ``'pdffilename'`` is
            always ``'Invalid'`` and an unreadable part is ``'NONE'``. If the
            browser cannot be started or the page cannot be loaded, only
            ``{'title': 'NONE', 'body': 'NONE'}`` is returned.
        """
        webDriver = None
        try:
            try:
                webDriver = self._open_driver()
                print("[DAUM-2]", linkAddress)
                webDriver.get(linkAddress)
            except Exception:
                return {
                    'title': 'NONE',
                    'body': 'NONE',
                }

            titleString = self._read_title(webDriver)
            bodyString = self._read_body(webDriver)

            return {
                'title': 'NONE' if titleString is None else titleString,
                'body': 'NONE' if bodyString is None else bodyString,
                'pdffilename': 'Invalid',
            }
        finally:
            if webDriver is not None:
                webDriver.quit()

    def StartDaumNewsSearch(self, keyWord):
        """Search Daum for ``keyWord`` and scrape the news results.

        Types the keyword into the Daum front page, switches to the news
        tab, scrapes the links on the first result page and then every page
        linked from the pagination bar.

        Args:
            keyWord: search term typed into the Daum search box.

        Returns:
            A list of dicts with the keys 'address', 'title' and 'body'.

        Raises:
            Selenium errors when the browser cannot be started or one of the
            expected page elements is missing.
        """
        LINK_DAUM = "https://www.daum.net"
        webDriver = self._open_driver()
        try:
            webDriver.get(LINK_DAUM)
            elem1 = webDriver.find_element_by_css_selector('.tf_keyword')
            elem1.clear()
            elem1.send_keys(keyWord)
            elem1.send_keys(Keys.RETURN)
            elem2 = webDriver.find_element_by_css_selector('.tab_news')
            elem2.click()

            returnMessages = self._collect_results(webDriver, "[DAUM-1]")

            paging = webDriver.find_element_by_css_selector('.paging_comm')
            for item1 in paging.find_elements_by_tag_name('a'):
                linkAddress1 = item1.get_attribute('href')
                # Anchors without a target cannot be opened; skipping them
                # keeps one bad pagination entry from aborting the search.
                if not linkAddress1:
                    continue
                returnMessages.extend(self.SearchEachPage(linkAddress1))

            return returnMessages
        finally:
            webDriver.quit()

    def SearchEachPage(self, linkAddress):
        """Scrape every '.f_link_b' link on one search-result page.

        Args:
            linkAddress: URL of the result page to open.

        Returns:
            A list of dicts with the keys 'address', 'title' and 'body'.

        Raises:
            Selenium errors when the browser cannot be started or the page
            cannot be loaded.
        """
        webDriver = self._open_driver()
        try:
            webDriver.get(linkAddress)
            print(linkAddress)
            return self._collect_results(webDriver, "[ADDRESS]")
        finally:
            webDriver.quit()

반응형
반응형
250x250
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2025/01   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
글 보관함