SWDesk
[Python] Collecting Company Data
inhae
2023. 1. 19. 09:07
SMInfo로부터 기업 데이터를 수집하기 위한 파이썬 소스코드
# Collect company details through the cSMInfo client for every company listed
# in the input workbook, appending one row per result to the output workbook.
#
# Per input row:
#   1. Read 'CompanyName1' / 'CEO1'; rows missing either are copied to the
#      output unchanged and skipped.
#   2. Normalise the company name and build the search query.
#   3. Skip rows already present in the output (same CompanyName and CEO),
#      which lets an interrupted run be resumed.
#   4. Search, then visit up to MAX_RESULTS_PER_COMPANY matches and store the
#      extracted detail (or a 'Remarks' note explaining why there is none).
# The output workbook is rewritten every SAVE_EVERY processed companies and
# once more when the loop ends, including when it ends with an error.
import time  # only needed for the polling sleep in the throttle wait below

sminfo1 = cSMInfo(False)
excelIN = "./HiddenChampionC_230116.xlsx"
sheetIN = "HiddenChampionD"
sheetOUT = "HiddenChampionC"
excelOUT = "./HiddenChampionData_SMInfo230117.xlsx"

MAX_RESULTS_PER_COMPANY = 10   # search hits visited per company
MIN_SECONDS_BETWEEN_DETAILS = 20   # threshold compared against timeDiff2['Second']
SAVE_EVERY = 100   # processed companies between intermediate saves
THROTTLE_POLL_SECONDS = 0.2   # sleep between throttle checks (avoids a busy spin)

# Matches a parenthesised fragment such as "(주)"; removed from company names.
pattern = r'\([^)]*\)'

company1DF = bExcel.ReadFile(excelIN, sheetIN)
if company1DF.empty:
    # Not fatal: the loop below simply has nothing to iterate over.
    print("[???]", "None of INPUT Data")
company9DF = bExcel.ReadFile(excelOUT, sheetOUT)

companies1 = company1DF.to_dict('records')
companyNumber = len(companies1)
preTime1 = bTime.CalDateTime()   # reference time for the throttle wait
companyCount1 = 0    # companies processed since the last intermediate save
companyCount11 = 0   # 1-based index of the current input row

try:
    for companiy1 in companies1:
        companyCount11 += 1
        #if companyCount11<250: continue;

        companyName11 = companiy1.get('CompanyName1')
        ceoName11 = companiy1.get('CEO1')
        # Require real, non-empty strings. Empty spreadsheet cells usually
        # arrive from pandas as float NaN, which is truthy and would otherwise
        # crash in re.sub()/split() below.
        hasName = isinstance(companyName11, str) and bool(companyName11)
        hasCeo = isinstance(ceoName11, str) and bool(ceoName11)
        if not (hasName and hasCeo):
            try:
                company9DF = pd.concat([company9DF, DataFrame([companiy1])], ignore_index=True, axis=0)
            except Exception as err:
                # Best effort: an unrecordable row must not stop the run.
                print("[???]", "Could not record incomplete row:", err)
            continue

        # Drop parenthesised fragments and the "주식회사" (Co., Ltd.) label.
        companyName1 = re.sub(pattern, '', companyName11).replace("주식회사", "").strip()
        # Only the first of several comma-separated CEO names is used.
        ceoName1 = ceoName11.split(",")[0]
        companyInfo1 = {
            'CompanyName': companyName1,
            'CEO': ceoName1,
            'Query': companyName1 + " " + ceoName1
        }
        print(companyInfo1)

        # Skip rows whose CEO field contains the company name (this is also
        # true when the cleaned company name is empty).
        if companyName1 in ceoName1:
            continue

        # Skip companies that are already in the output. The column check
        # matters because the output may so far contain only skipped input
        # rows, which have the input's columns rather than these two.
        if (not company9DF.empty) and {'CompanyName', 'CEO'}.issubset(company9DF.columns):
            alreadyCollected = (
                (company9DF['CompanyName'] == companyName1)
                & (company9DF['CEO'] == ceoName1)
            ).any()
            if alreadyCollected:
                continue

        # webDriver is None whenever no browser session is open, so the
        # finally clause can close exactly the sessions that are still alive.
        webDriver = sminfo1.Login()
        try:
            try:
                webDriver = sminfo1.SetSearchCriteria(webDriver, companyInfo1)
                companyInfo1DF = sminfo1.CollectCompanyInfos(webDriver)
            except Exception:
                # A failed search is recorded below as "no company info".
                companyInfo1DF = DataFrame()

            if companyInfo1DF.empty:
                companyInfo1['OriginalData'] = None
                companyInfo1['I_Date'] = bTime.GetTimeString()
                companyInfo1['Remarks'] = "None of Company Info"
                companyInfo1DF = DataFrame([companyInfo1])
                company9DF = pd.concat([company9DF, companyInfo1DF], ignore_index=True, axis=0)
                continue   # the finally clause closes the session

            count22 = 0   # search hits counted towards MAX_RESULTS_PER_COMPANY
            for companyInfo2 in companyInfo1DF.to_dict('records'):
                count22 += 1
                if count22 > MAX_RESULTS_PER_COMPANY:
                    break

                originalData = companyInfo2.get('OriginalData')
                if not originalData:
                    companyInfo2['I_Date'] = bTime.GetTimeString()
                    companyInfo2['Remarks'] = "None of Company Info"
                    company9DF = pd.concat([company9DF, DataFrame([companyInfo2])], ignore_index=True, axis=0)
                    # Each stored record ends the current session; the next
                    # record that needs a browser opens a fresh one.
                    if webDriver is not None:
                        webDriver.quit()
                        webDriver = None
                    continue

                companyInfo21 = json.loads(originalData)
                companyType2 = companyInfo21.get("기업형태")   # "company type"
                if companyType2 == "개인사업자":   # "sole proprietor"
                    # Sole proprietors do not count towards the per-company limit.
                    count22 -= 1
                    #continue;

                # Throttle: wait until the elapsed 'Second' value reported by
                # bTime exceeds the threshold (presumably rate limiting against
                # the site -- confirm). Sleep between polls instead of spinning.
                while True:
                    timeDiff2 = bTime.GetTimeDifference(bTime.GetTimeString(preTime1))
                    secDiff2 = timeDiff2['Second']
                    if secDiff2 > MIN_SECONDS_BETWEEN_DETAILS:
                        preTime1 = bTime.CalDateTime()
                        break
                    time.sleep(THROTTLE_POLL_SECONDS)

                # The first record reuses the search session; later records
                # log in and repeat the search because the session was closed.
                if webDriver is None:
                    webDriver = sminfo1.Login()
                    webDriver = sminfo1.SetSearchCriteria(webDriver, companyInfo1)

                res21 = sminfo1.SelectTheCompany(webDriver, companyInfo21)
                if res21:
                    try:
                        res22 = sminfo1.ExtractDetail(webDriver)
                        companyInfo2.update(res22)
                    except Exception:
                        companyInfo2['Remarks'] = "Prohibited Company Info"
                else:
                    companyInfo2['Remarks'] = "Invalid Company Info"

                companyInfo2['CompanyType'] = companyType2
                companyInfo2['I_Date'] = bTime.GetTimeString()
                company9DF = pd.concat([company9DF, DataFrame([companyInfo2])], ignore_index=True, axis=0)
                webDriver.quit()
                webDriver = None
        finally:
            # Close a session left open by an early exit or an error.
            if webDriver is not None:
                webDriver.quit()

        companyCount1 += 1
        if companyCount1 >= SAVE_EVERY:
            companyCount1 = 0
            # Report overall progress (the counter itself was just reset).
            print("[companyCount11] ", companyCount11, "/", companyNumber)
            bExcel.WriteFile(company9DF, excelOUT, sheetOUT)
finally:
    # Always persist what has been collected, even if the run was interrupted
    # or failed, so a later run can resume from the saved rows.
    bExcel.WriteFile(company9DF, excelOUT, sheetOUT)
print("End")
반응형