[Python] Collecting Infos from Webpages

티스토리 뷰

SWDesk

[Python] Collecting Infos from Webpages

inhae 2022. 12. 4. 14:54

Filename : AETronix22B.py

Functions

Collect images
Set page-no
Collect product infos
Collect category infos

[Collect Images]

    def CollectImages(self, sheetName=None, folderName=None):
        """Download the images referenced by one sheet of the goods workbook.

        Reads ``sheetName`` from "file10.xlsx" through ``bExcel.ReadFile``,
        takes the 'ImageURL' value of every row and stores each image under
        "./Images/<folderName>/", named after the last path segment of its
        URL. An image whose target file already exists is not downloaded
        again, and a URL that occurs on several rows is handled only once.

        Args:
            sheetName: Worksheet to read; "LiFt" is used when falsy.
            folderName: Sub-folder of "./Images" to save into;
                "LightingFittings" is used when falsy.

        Returns:
            None.
        """
        fileName = "file10.xlsx"
        folderPath = "./Images"
        if not sheetName:
            sheetName = "LiFt"
        if not folderName:
            folderName = "LightingFittings"
        folderPathName = folderPath + "/" + folderName
        # Creates "./Images" and the sub-folder in one call; no error if present.
        os.makedirs(folderPathName, exist_ok=True)

        goodsInfoDF = bExcel.ReadFile(fileName, sheetName)
        if goodsInfoDF.empty:
            return

        seenURLs = set()
        for goodsInfo in goodsInfoDF.to_dict('records'):
            imageURL = goodsInfo.get('ImageURL')
            # Skip missing values. Non-string cells are skipped too: they cannot
            # be sliced below (presumably empty cells arrive as NaN - TODO
            # confirm against bExcel.ReadFile).
            if not isinstance(imageURL, str) or not imageURL:
                continue
            # The previous test "(not imageURLs) and (imageURL in imageURLs)"
            # could never be true, so repeated URLs were not actually filtered.
            if imageURL in seenURLs:
                continue
            seenURLs.add(imageURL)

            # File name = everything after the last "/" of the URL.
            imageName = imageURL[imageURL.rfind("/") + 1:]
            imagePathName = folderPathName + "/" + imageName
            if os.path.exists(imagePathName):
                continue
            urllib.request.urlretrieve(imageURL, imagePathName)

[Select PageNo.]

    def SetPage(self, webDriver, pageTagInfo, pageNo):
        """Click the pagination element whose text equals ``pageNo``.

        The candidate elements are obtained with
        ``pageTagInfo.GetTargetElem(webDriver, "AE43")``; the text of each one
        is read through ``self.WebControl.GetTextinElement`` and compared, as
        an integer, with ``pageNo``.

        Args:
            webDriver: Driver object handed to ``GetTargetElem`` and
                ``TreatClick``.
            pageTagInfo: Object providing ``GetTargetElem(webDriver, tag)``.
            pageNo: Page number (int) to select.

        Returns:
            ``pageNo`` when a matching element was clicked, otherwise ``None``.
        """
        targetTag11 = "AE43"
        elem11s = pageTagInfo.GetTargetElem(webDriver, targetTag11)
        for elem11 in elem11s:
            pageNo11 = self.WebControl.GetTextinElement(elem11)
            if not pageNo11:
                continue
            try:
                candidateNo = int(pageNo11)
            except (TypeError, ValueError):
                # Labels that are not plain integers cannot be a page number.
                continue
            if candidateNo != pageNo:
                continue
            try:
                # NOTE(review): 10 is presumably a wait/timeout value - confirm
                # against WebControl.TreatClick.
                self.WebControl.TreatClick(webDriver, elem11, 10)
            except Exception:
                # Best effort, as before: a failed click moves on to the next
                # candidate. Unlike a bare "except:", this no longer swallows
                # KeyboardInterrupt / SystemExit.
                continue
            return pageNo
        return None

[Collect Product Infos]

    def GetProductInfo(self, pageURL):
        """Open a product page and collect its details into a dict.

        A web driver is obtained from ``self.WebControl``, ``pageURL`` is
        loaded, and elements are looked up through a ``cPageTagInfo`` built
        from ``self.SheetName_TagInfo``. The driver is always closed before
        returning, including when a lookup raises.

        Args:
            pageURL: Address of the product page.

        Returns:
            dict with 'Title' and 'ImageURL'; 'Price' and 'Alert' when their
            text is non-empty; plus one entry per id/value element pair, keyed
            by the id text ('Remarks' when that text is empty). Pairs are
            matched positionally; if the two element lists differ in length
            the surplus entries are ignored.
        """
        tag_title = "AE28"
        tag_price = "AE32"
        tag_ids = "AE30"
        tag_values = "AE31"
        tag_alert = "AE33"
        tag_image = "AE25"

        webDriver = self.WebControl.GetWebDriver()
        try:
            webDriver = self.WebControl.AccessPage(webDriver, pageURL, 5)
            pageTagInfo = cPageTagInfo(self.SheetName_TagInfo)

            elem_image = pageTagInfo.GetTargetElem(webDriver, tag_image)
            imageURL = elem_image.get_attribute('src')
            elem_title = pageTagInfo.GetTargetElem(webDriver, tag_title)
            elem_price = pageTagInfo.GetTargetElem(webDriver, tag_price)
            elem_ids = pageTagInfo.GetTargetElem(webDriver, tag_ids)
            elem_values = pageTagInfo.GetTargetElem(webDriver, tag_values)

            rst1 = {
                'Title': self.WebControl.GetTextinElement(elem_title),
                'ImageURL': imageURL,
            }

            price1 = self.WebControl.GetTextinElement(elem_price)
            # Fixed inverted test: previously the price was stored only when
            # it was empty, so real prices never reached the result.
            if price1:
                rst1['Price'] = price1

            try:
                elem_alert = pageTagInfo.GetTargetElem(webDriver, tag_alert)
                alert1 = self.WebControl.GetTextinElement(elem_alert)
                if alert1:
                    rst1['Alert'] = alert1
            except Exception:
                # The alert is treated as optional: a failed lookup is ignored.
                pass

            for elem_id, elem_value in zip(elem_ids, elem_values):
                elemName = self.WebControl.GetTextinElement(elem_id)
                elemValue = self.WebControl.GetTextinElement(elem_value)
                # A value without a label is filed under 'Remarks'.
                if not elemName:
                    rst1['Remarks'] = elemValue
                else:
                    rst1[elemName] = elemValue
            return rst1
        finally:
            # Close the browser even when scraping fails part-way.
            if webDriver is not None:
                webDriver.quit()

[Collect Category Infos]

    def GetMidCatData(self, midCatInfo):
        """Visit a mid-level category page and process each sub-category link.

        For every "AE10" element with a non-empty link text, the products of
        that sub-category are collected via ``self.GetProductInfos``. A row
        describing the sub-category is then added to ``self.MidCatInfoDF``.
        When ``self.Is4ProductInfo`` is truthy, ``self.ProductInfoDF`` is
        written to "./File16.xlsx" after each sub-category.

        Resets ``self.SubCatNames``, ``self.SubCatInfos`` and
        ``self.ProductInfoDF`` on entry. The web driver is always closed,
        including when processing raises.

        Args:
            midCatInfo: dict read for 'CatURL' and 'SheetName'. It is updated
                in place with 'SubCatName' and 'ProductNumber' for each
                sub-category before being recorded.

        Returns:
            None.
        """
        # Local import: pd.concat replaces DataFrame.append, which was
        # deprecated in pandas 1.4 and removed in pandas 2.0.
        import pandas as pd

        self.SubCatNames = []
        self.SubCatInfos = {}
        self.ProductInfoDF = DataFrame()
        url11 = midCatInfo.get('CatURL')
        sheetName = midCatInfo.get('SheetName')
        tag_a = {
            'TagType': "html",
            'TagValue': "a"
        }
        targetTag11 = "AE10"

        webDriver = self.WebControl.GetWebDriver(False)
        try:
            webDriver = self.WebControl.AccessPage(webDriver, url11, 10)
            self.PageTagInfo.ClearPageElems()
            elem11s = self.PageTagInfo.GetTargetElem(webDriver, targetTag11)
            for elem11 in elem11s:
                elem11a = self.WebControl.GetElement(elem11, tag_a)
                subCatLink = elem11a.get_attribute('href')
                subCatName = self.WebControl.GetTextinElement(elem11a)
                if not subCatName:
                    continue
                print(subCatName, " : ", subCatLink)

                productNumber = self.GetProductInfos(subCatLink, subCatName)
                # Written after every sub-category, as in the original flow.
                if self.Is4ProductInfo:
                    bExcel.WriteFile(self.ProductInfoDF, "./File16.xlsx", sheetName)

                midCatInfo['SubCatName'] = subCatName
                midCatInfo['ProductNumber'] = productNumber or 0
                print(midCatInfo)
                # Wrapping the dict in a one-row frame copies its current
                # values, so later mutation of midCatInfo does not affect
                # rows that were already recorded.
                self.MidCatInfoDF = pd.concat(
                    [self.MidCatInfoDF, pd.DataFrame([midCatInfo])],
                    ignore_index=True,
                )
        finally:
            # Close the browser even when a sub-category fails.
            if webDriver is not None:
                webDriver.quit()

'SWDesk' 카테고리의 다른 글

[Python] 사진에서 사람의 눈과 코, 입 등 특징점을 추출하는 코드 (0)	2023.03.05
[Python] Collecting Company Data (0)	2023.01.19
[Python] Compare 'Dict' (0)	2022.10.11
[Python] DataFrame과 dict 합치기 (0)	2022.10.10
[Python] TTS(text to sound) Example (0)	2022.10.03

250x250

공지사항

최근에 올라온 글

최근에 달린 댓글

Total

Today

Yesterday

링크

TAG more

« 2025/05 »
일	월	화	수	목	금	토
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30	31

글 보관함

Connecting HW and SW

티스토리 뷰

[Python] Collecting Infos from Webpages

'SWDesk' 카테고리의 다른 글

티스토리툴바