SWDesk
[Python] Collecting Infos from Webpages
inhae
2022. 12. 4. 14:54
Filename : AETronix22B.py
Functions
- Collect images
- Set page-no
- Collect product infos
- Collect category infos
[Collect Images]
def CollectImages(self, sheetName=None, folderName=None):
    """Download the product images listed in an Excel sheet.

    Reads sheet ``sheetName`` of ``file10.xlsx`` through
    ``bExcel.ReadFile``, takes the ``ImageURL`` value of every row and
    saves each image to ``./Images/<folderName>/`` under the name formed
    by the text after the last ``/`` of the URL.  A URL is skipped when it
    was already handled in this run or when its target file already
    exists on disk.

    Args:
        sheetName: Sheet to read; falls back to ``"LiFt"`` when falsy.
        folderName: Sub-folder of ``./Images``; falls back to
            ``"LightingFittings"`` when falsy.

    Returns:
        None.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises for a failed
        download; the error is not caught here.
    """
    fileName = "file10.xlsx"
    folderPath = "./Images"
    if not sheetName:
        sheetName = "LiFt"
    if not folderName:
        folderName = "LightingFittings"

    # Creates ./Images and the sub-folder in one call; no error if present.
    folderPathName = folderPath + "/" + folderName
    os.makedirs(folderPathName, exist_ok=True)

    goodsInfoDF = bExcel.ReadFile(fileName, sheetName)
    if goodsInfoDF is None or goodsInfoDF.empty:
        return

    seenURLs = set()
    for goodsInfo in goodsInfoDF.to_dict('records'):
        imageURL = goodsInfo.get('ImageURL')
        # A blank cell may come back as NaN (a truthy float), so check the
        # type instead of relying on truthiness alone.
        if not isinstance(imageURL, str) or not imageURL:
            continue
        # The original guard `(not imageURLs) and (imageURL in imageURLs)`
        # could never be true; a plain membership test is what was meant.
        if imageURL in seenURLs:
            continue
        seenURLs.add(imageURL)

        imageName = imageURL[imageURL.rfind("/") + 1:]
        if not imageName:
            # URL ends with "/": there is no file name to save under.
            continue
        imagePathName = folderPathName + "/" + imageName
        if os.path.exists(imagePathName):
            continue
        urllib.request.urlretrieve(imageURL, imagePathName)
[Select PageNo.]
def SetPage(self, webDriver, pageTagInfo, pageNo):
    """Click the pagination element whose text equals ``pageNo``.

    Looks up the elements registered under tag key ``"AE43"`` via
    ``pageTagInfo.GetTargetElem`` and reads each one's text with
    ``self.WebControl.GetTextinElement``.  Elements with empty or
    non-numeric text are ignored.

    Args:
        webDriver: Driver object passed through to the helpers.
        pageTagInfo: Object providing ``GetTargetElem(webDriver, tagKey)``.
        pageNo: Integer page number to select.

    Returns:
        ``pageNo`` when a matching element was clicked, otherwise ``None``.
    """
    pageLinks = pageTagInfo.GetTargetElem(webDriver, "AE43")
    # `or []` lets a lookup that yields nothing read as "page not found".
    for pageLink in pageLinks or []:
        label = self.WebControl.GetTextinElement(pageLink)
        if not label:
            continue
        try:
            labelNo = int(label)
        except (TypeError, ValueError):
            # Non-numeric labels (e.g. "Next") are not page numbers.
            continue
        if labelNo != pageNo:
            continue
        try:
            # Third argument is presumably a wait/timeout value - confirm
            # against WebControl.TreatClick.
            self.WebControl.TreatClick(webDriver, pageLink, 10)
        except Exception:
            # Best effort, as before: a failed click moves on to the next
            # candidate.  KeyboardInterrupt/SystemExit now propagate.
            continue
        return pageNo
    return None
[Collect Product Infos]
def GetProductInfo(self, pageURL):
    """Open a product page and collect its details into a dict.

    A new web driver is obtained from ``self.WebControl``, pointed at
    ``pageURL``, and always closed before returning, including when a
    lookup raises.

    Args:
        pageURL: URL of the product page to open.

    Returns:
        dict with ``'Title'`` and ``'ImageURL'``; ``'Price'`` and
        ``'Alert'`` when those texts are non-empty; plus one entry per
        name/value pair found under tag keys ``"AE30"``/``"AE31"``.  A
        pair whose name text is empty is stored under ``'Remarks'``.
    """
    tag_title = "AE28"
    tag_price = "AE32"
    tag_ids = "AE30"
    tag_values = "AE31"
    tag_alert = "AE33"
    tag_image = "AE25"

    webDriver = self.WebControl.GetWebDriver()
    try:
        webDriver = self.WebControl.AccessPage(webDriver, pageURL, 5)
        pageTagInfo = cPageTagInfo(self.SheetName_TagInfo)

        elem_image = pageTagInfo.GetTargetElem(webDriver, tag_image)
        imageURL = elem_image.get_attribute('src')

        elem_title = pageTagInfo.GetTargetElem(webDriver, tag_title)
        elem_price = pageTagInfo.GetTargetElem(webDriver, tag_price)
        elem_ids = pageTagInfo.GetTargetElem(webDriver, tag_ids)
        elem_values = pageTagInfo.GetTargetElem(webDriver, tag_values)

        rst1 = {
            'Title': self.WebControl.GetTextinElement(elem_title),
            'ImageURL': imageURL,
        }

        # The original tested `if not price1`, which stored the price only
        # when it was empty; store it when it is present, like the alert.
        price1 = self.WebControl.GetTextinElement(elem_price)
        if price1:
            rst1['Price'] = price1

        # The alert lookup is best effort: a failure here is ignored.
        try:
            elem_alert = pageTagInfo.GetTargetElem(webDriver, tag_alert)
            alert1 = self.WebControl.GetTextinElement(elem_alert)
            if alert1:
                rst1['Alert'] = alert1
        except Exception:
            pass

        # Names and values are paired by position.  zip() stops at the
        # shorter list instead of raising IndexError on a mismatch.
        for elem_id, elem_value in zip(elem_ids, elem_values):
            elemName = self.WebControl.GetTextinElement(elem_id)
            elemValue = self.WebControl.GetTextinElement(elem_value)
            if not elemName:
                rst1['Remarks'] = elemValue
            else:
                rst1[elemName] = elemValue

        return rst1
    finally:
        webDriver.quit()
[Collect Category Infos]
def GetMidCatData(self, midCatInfo):
    """Walk the sub-categories of one mid-level category page.

    Opens ``midCatInfo['CatURL']``, finds the elements registered under
    tag key ``"AE10"``, and for every one whose ``<a>`` child has
    non-empty text:

    * calls ``self.GetProductInfos(subCatLink, subCatName)``;
    * when ``self.Is4ProductInfo`` is truthy, writes
      ``self.ProductInfoDF`` to ``./File16.xlsx`` (sheet
      ``midCatInfo['SheetName']``);
    * appends one row built from ``midCatInfo`` to ``self.MidCatInfoDF``.

    ``self.SubCatNames``, ``self.SubCatInfos`` and ``self.ProductInfoDF``
    are reset on entry.  The web driver is always closed, including when
    a step raises.

    Args:
        midCatInfo: dict read for ``'CatURL'`` and ``'SheetName'``.  It is
            modified in place: ``'SubCatName'`` and ``'ProductNumber'``
            are overwritten for each sub-category.

    Returns:
        None.
    """
    # Function-local import: DataFrame.append was removed in pandas 2.0,
    # and pd.concat is its documented replacement.
    import pandas as pd

    self.SubCatNames = []
    self.SubCatInfos = {}
    self.ProductInfoDF = DataFrame()

    catURL = midCatInfo.get('CatURL')
    sheetName = midCatInfo.get('SheetName')
    tag_a = {
        'TagType': "html",
        'TagValue': "a",
    }

    webDriver = self.WebControl.GetWebDriver(False)
    try:
        webDriver = self.WebControl.AccessPage(webDriver, catURL, 10)
        self.PageTagInfo.ClearPageElems()
        subCatElems = self.PageTagInfo.GetTargetElem(webDriver, "AE10")

        for subCatElem in subCatElems:
            anchor = self.WebControl.GetElement(subCatElem, tag_a)
            subCatLink = anchor.get_attribute('href')
            subCatName = self.WebControl.GetTextinElement(anchor)
            if not subCatName:
                continue
            print(subCatName, " : ", subCatLink)

            productNumber = self.GetProductInfos(subCatLink, subCatName)
            if self.Is4ProductInfo:
                bExcel.WriteFile(self.ProductInfoDF, "./File16.xlsx", sheetName)

            midCatInfo['SubCatName'] = subCatName
            # A falsy product count is recorded as 0.
            midCatInfo['ProductNumber'] = productNumber or 0
            print(midCatInfo)

            # Add the row right away so earlier rows survive if a later
            # sub-category raises.
            self.MidCatInfoDF = pd.concat(
                [self.MidCatInfoDF, pd.DataFrame([midCatInfo])],
                ignore_index=True,
            )
    finally:
        webDriver.quit()
반응형