티스토리 뷰

<Preparation>

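  • Chrome driver (a chromedriver executable that matches the installed Chrome version)
  • "pip install selenium"
  • "pip install weasyprint" (the code below uses it to save pages as PDF)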

 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import datetime
from random import *
from weasyprint import HTML


class cGoogleSearch:
    """Collect Google news search results with a Selenium-driven Chrome.

    Every page visit starts its own Chrome instance through the driver at
    ``EXECUTABLE_PATH`` and shuts it down again before the method returns,
    including when an error occurs part-way through.

    NOTE(review): the code uses the Selenium 3 style API
    (``find_element_by_*`` and the ``executable_path`` keyword) - confirm the
    installed Selenium version still provides it.
    """

    # Path to the ChromeDriver binary passed to every webdriver.Chrome() call.
    EXECUTABLE_PATH = ".../ChromeDriver/chromedriver.exe"

    def __init__(self):
        pass

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _close_driver(webDriver):
        """End the browser session; a failure to quit is reported, not raised."""
        try:
            # quit() ends the whole session; close() would only close the window.
            webDriver.quit()
        except Exception as error:
            print("[Excepted]-QUIT", error)

    def _open_page(self, linkAddress, logTag):
        """Start Chrome and load ``linkAddress``; return the driver or ``None``.

        On any failure the partially started browser is shut down so that it
        does not stay behind as an orphaned process.
        """
        webDriver = None
        try:
            webDriver = webdriver.Chrome(executable_path=self.EXECUTABLE_PATH)
            print(logTag, linkAddress)
            webDriver.get(linkAddress)
        except Exception:
            if webDriver is not None:
                self._close_driver(webDriver)
            return None
        return webDriver

    @staticmethod
    def _read_title(webDriver):
        """Return the page's ``<title>`` text without newlines, or ``None``."""
        try:
            titleTag = webDriver.find_element_by_tag_name('title')
            titleString = titleTag.get_attribute('innerHTML').replace("\n", "")
        except Exception:
            print("[Excepted]-TITLE")
            return None
        print("[titleS]", titleString)
        return titleString

    @staticmethod
    def _read_body(webDriver):
        """Return the visible text of ``<body>``, or ``None`` when unavailable."""
        try:
            return webDriver.find_element_by_tag_name('body').text
        except Exception:
            print("[Excepted]-BODY")
            return None

    @staticmethod
    def _pdf_stem(titleString):
        """Derive the title part of the PDF file name.

        Titles longer than 10 characters contribute characters 2..9 (the first
        two are skipped, as before); shorter titles are used whole. Path
        separators, characters reserved in Windows file names and control
        characters are replaced by ``_`` so the title cannot break the path.
        """
        stem = titleString[2:10] if len(titleString) > 10 else titleString
        return "".join(
            "_" if (ch in '\\/:*?"<>|' or ord(ch) < 32) else ch
            for ch in stem
        )

    @staticmethod
    def _is_news_tab(tabText):
        """Tell whether a search-tab label names the news tab (English/Korean)."""
        # Compare case-insensitively so a label such as "News" also matches.
        return 'news' in tabText.lower() or '뉴스' in tabText

    def _save_pdf(self, linkAddress, titleString):
        """Render ``linkAddress`` to a PDF and return its path, or ``'NONE'``.

        The file is written to ``./PDFDocus/<title>_<YYYYmmddHHMMSS>.pdf``; the
        directory is created when missing. Rendering errors are printed and
        reported through the ``'NONE'`` return value.
        """
        import os  # local import: only this helper needs it

        datename1 = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        filename1 = (
            "./PDFDocus/" + self._pdf_stem(titleString) + "_" + datename1 + ".pdf"
        )
        try:
            os.makedirs("./PDFDocus", exist_ok=True)
            HTML(linkAddress).write_pdf(filename1)
        except Exception as error:
            print("[Excepted]-PDF", error)
            return 'NONE'
        print("PDF saved", linkAddress)
        return filename1

    def _collect_results(self, webDriver, logAddress=False):
        """Visit every ``.DyOREb`` entry on the current page.

        For each entry the first ``<a>`` is followed with ``GetLinkPageInfo``
        and a dict with ``address``, ``title`` and ``body`` is produced.
        NOTE(review): ``.DyOREb`` is presumably the CSS class of one news
        result - verify against the current Google markup.
        """
        messages = []
        for item2 in webDriver.find_elements_by_css_selector('.DyOREb'):
            linkAddress1 = item2.find_element_by_tag_name('a').get_attribute('href')
            if logAddress:
                print("[ADDRESS]", linkAddress1)
            pageInfo = self.GetLinkPageInfo(linkAddress1)
            messages.append({
                'address': linkAddress1,
                'title': pageInfo['title'],
                'body': pageInfo['body'],
            })
        return messages

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def GetLinkPage(self, linkAddress):
        """Load a page, read its title and body text, and save it as a PDF.

        Args:
            linkAddress: URL of the page to load.

        Returns:
            A dict with the keys ``title``, ``body`` and ``pdffilename``.
            Values that could not be obtained are the string ``'NONE'``; the
            PDF is only written when both title and body were read.
        """
        webDriver = self._open_page(linkAddress, "[PAGE-1]")
        if webDriver is None:
            return {'title': 'NONE', 'body': 'NONE', 'pdffilename': 'NONE'}

        try:
            titleString = self._read_title(webDriver)
            bodyString = self._read_body(webDriver)
            filename1 = 'NONE'
            if titleString is not None and bodyString is not None:
                filename1 = self._save_pdf(linkAddress, titleString)
        finally:
            self._close_driver(webDriver)

        return {
            'title': 'NONE' if titleString is None else titleString,
            # A missing body used to leave this name unbound (NameError).
            'body': 'NONE' if bodyString is None else bodyString,
            'pdffilename': filename1,
        }

    def GetLinkPageInfo(self, linkAddress):
        """Load a page and read its title and body text (no PDF is written).

        Args:
            linkAddress: URL of the page to load.

        Returns:
            A dict with the keys ``title``, ``body`` and ``pdffilename``.
            ``title``/``body`` are ``'NONE'`` when they could not be read;
            ``pdffilename`` is always ``'Invalid'``.
        """
        webDriver = self._open_page(linkAddress, "[GOOGLE-2]")
        if webDriver is None:
            return {'title': 'NONE', 'body': 'NONE', 'pdffilename': 'Invalid'}

        try:
            titleString = self._read_title(webDriver)
            bodyString = self._read_body(webDriver)
        finally:
            self._close_driver(webDriver)

        return {
            'title': 'NONE' if titleString is None else titleString,
            'body': 'NONE' if bodyString is None else bodyString,
            'pdffilename': 'Invalid',
        }

    def StartGoogleSearch(self, keyWord):
        """Search Google for ``keyWord`` and collect the news results.

        The search is typed into the ``q`` field, the news tab is opened, the
        results on that page are visited, and every ``.fl`` link that has an
        ``href`` is processed with ``SearchEachPage``.
        NOTE(review): ``.fl`` presumably selects the pagination links - verify.

        Args:
            keyWord: Search text to submit.

        Returns:
            A list of dicts with the keys ``address``, ``title`` and ``body``.
            The list is empty when no news tab is found on the result page.
        """
        returnMessages = []
        webDriver = webdriver.Chrome(executable_path=self.EXECUTABLE_PATH)
        try:
            webDriver.get('https://www.google.com')
            elem = webDriver.find_element_by_name('q')
            elem.send_keys(keyWord)
            elem.send_keys(Keys.RETURN)

            newsTab = None
            for elem1 in webDriver.find_elements_by_css_selector('.hdtb-imb'):
                tabText = elem1.text
                print(tabText)
                if self._is_news_tab(tabText):
                    newsTab = elem1
                    break
            if newsTab is None:
                # Previously this crashed (no tabs) or used an arbitrary tab.
                print("[Excepted]-NEWS-TAB")
                return returnMessages

            link1 = newsTab.find_element_by_tag_name('a')
            webDriver.get(link1.get_attribute('href'))

            returnMessages.extend(self._collect_results(webDriver))

            for item3 in webDriver.find_elements_by_css_selector('.fl'):
                linkAddress2 = item3.get_attribute('href')
                if linkAddress2:
                    returnMessages.extend(self.SearchEachPage(linkAddress2))
        finally:
            self._close_driver(webDriver)
        return returnMessages

    def SearchEachPage(self, linkAddress):
        """Open one result page and collect the entries listed on it.

        Args:
            linkAddress: URL of the result page to open.

        Returns:
            A list of dicts with the keys ``address``, ``title`` and ``body``,
            one per ``.DyOREb`` entry found on the page.
        """
        webDriver = webdriver.Chrome(executable_path=self.EXECUTABLE_PATH)
        try:
            print("[GOOGLE-1]", linkAddress)
            webDriver.get(linkAddress)
            print(linkAddress)
            return self._collect_results(webDriver, logAddress=True)
        finally:
            self._close_driver(webDriver)
반응형
반응형
250x250
최근에 올라온 글
최근에 달린 댓글
Total
Today
Yesterday
링크
«   2025/01   »
1 2 3 4
5 6 7 8 9 10 11
12 13 14 15 16 17 18
19 20 21 22 23 24 25
26 27 28 29 30 31
글 보관함