1. All-company data classified by the sentiment analysis engine
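All four functions below share the same setup, which this post does not show. Here is a minimal sketch of the assumed imports and objects: start_to_date(), end_to_date(), score_input() and the call_url, call_GRU_predict, call_all_company modules are the helpers built in the earlier posts of this series, and options is assumed to be a plain ChromeOptions instance. Note that the browser window must stay visible and focused, because pyautogui sends real key presses.

import os
import re
import time
from datetime import datetime

import pandas as pd
import pyautogui
from bs4 import BeautifulSoup
from selenium import webdriver

import call_url           # returns the review-page URLs (earlier post in this series)
import call_GRU_predict   # wrapper around the GRU sentiment engine (earlier post)
import call_all_company   # assumed to be the module holding the weekly_* functions below

# Assumption: no headless flag, since pyautogui needs a real, focused window.
options = webdriver.ChromeOptions()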
def weekly_all_classified_data(url_dict):
    start_date, end_date = start_to_date(), end_to_date()
    maximum, minimum = score_input()
    company_list, grade_list, date_list, content_list = [], [], [], []
    for key, value in url_dict.items():
        driver = webdriver.Chrome('C:\\Users\\LeeKwanHyeong\\chromedriver_win32\\chromedriver.exe',
                                  chrome_options=options)
        driver.implicitly_wait(2)
        driver.get(value)
        # sort dropdown: open it and choose "newest" with two Up-arrow presses.
        elem = driver.find_element_by_xpath("//span[@class='DPvwYc']")
        elem.click()
        time.sleep(3)
        pyautogui.press('up')
        time.sleep(0.7)
        pyautogui.press('up')
        time.sleep(0.7)
        pyautogui.press('enter')
        # Scroll to the bottom until the "show more" button appears, then click it.
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            try:
                element = driver.find_element_by_xpath('//div[@class="U26fgb O0WRkf oG5Srb C0oVfc n9lfJ"]')
                element.click()
                break
            except Exception:
                continue
        html = driver.page_source
        driver.quit()
        bsObj = BeautifulSoup(html, 'lxml')
        div_reviews = bsObj.find_all("div", {"class": "d15Mdf bAhLNe"})
        for div in div_reviews:
            # Pull the numbers out of the Korean date string, e.g. "2021년 1월 13일".
            date_ = div.find('span', {"class": "p2TkOb"}).get_text()
            t = re.findall(r"\d*\.\d+|\d+", date_)
            year, month, day = int(t[0]), int(t[1]), int(t[2])
            dd = datetime(year, month, day)
            # Keep only reviews inside the [start_date, end_date] window.
            if start_date <= dd <= end_date:
                content = div.find('span', {'jsname': 'bN97Pc'}).get_text()
                content = content.replace("전체 리뷰", '')
                content = re.sub(r"[^가-힣0-9a-zA-Z_!?@#%^&\-=:;,\"'<>\s]", '', content)
                # The star rating equals the number of filled-star divs.
                grade = len(div.find_all('div', {'class': 'vQHuPe bUWb7c'}))
                percentage, word = call_GRU_predict.GRU_predict(content)
                # Keep only reviews whose sentiment score falls inside the chosen
                # band and whose tokenized text is long enough to be meaningful.
                if minimum < percentage < maximum and len(word) > 6:
                    date_list.append(dd)
                    content_list.append(content)
                    grade_list.append(grade)
                    company_list.append(key)
    data_frame = pd.DataFrame({'company': company_list,
                               'date': date_list,
                               'grade': grade_list,
                               'content': content_list})
    # Ratings of 3 stars or more count as "good", the rest as "bad".
    good_data = data_frame[data_frame['grade'] > 2]
    bad_data = data_frame[data_frame['grade'] < 3]
    # pd.ExcelWriter rewrites the workbook on every run.
    writer = pd.ExcelWriter("C:/data/thisweek_classified.xlsx")
    good_data.to_excel(writer, sheet_name='good', header=True)
    bad_data.to_excel(writer, sheet_name='bad', header=True)
    writer.save()
    # Append to the weekly CSV, writing the header only on the first run.
    if not os.path.exists('C:\\Data\\thisweek_classified.csv'):
        data_frame.to_csv('C:\\Data\\thisweek_classified.csv', index=False, mode='w', encoding='utf_8_sig')
    else:
        data_frame.to_csv('C:\\Data\\thisweek_classified.csv', index=False, mode='a', encoding='utf_8_sig', header=False)
    return data_frame
def classified_all_data():
    url_dict = call_url.call_url()  # returns the {company: review URL} dict
    # The start and end dates are chosen inside weekly_all_classified_data().
    load = call_all_company.weekly_all_classified_data(url_dict)
    return load
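A quick usage sketch. The dict shape is inferred from how key and value are used above (company name as key, Play Store review-page URL as value); the commented example entry is hypothetical.

# Hypothetical shape of what call_url.call_url() returns:
# {'CompanyA': 'https://play.google.com/store/apps/details?id=...&showAllReviews=true'}
df = classified_all_data()
print(df.head())
print(df['company'].value_counts())  # reviews kept per company after the sentiment filter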
2. All-company data not classified by the sentiment analysis engine
def weekly_all_unclassified_data(url_dict):
    start_date, end_date = start_to_date(), end_to_date()
    company_list, grade_list, date_list, content_list = [], [], [], []
    for key, value in url_dict.items():
        driver = webdriver.Chrome('C:\\Users\\LeeKwanHyeong\\chromedriver_win32\\chromedriver.exe',
                                  chrome_options=options)
        driver.implicitly_wait(2)
        driver.get(value)
        # sort dropdown: open it and choose "newest" with two Up-arrow presses.
        elem = driver.find_element_by_xpath("//span[@class='DPvwYc']")
        elem.click()
        time.sleep(3)
        pyautogui.press('up')
        time.sleep(0.7)
        pyautogui.press('up')
        time.sleep(0.7)
        pyautogui.press('enter')
        # Scroll to the bottom until the "show more" button appears, then click it.
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(0.5)
            try:
                element = driver.find_element_by_xpath('//div[@class="U26fgb O0WRkf oG5Srb C0oVfc n9lfJ"]')
                element.click()
                break
            except Exception:
                continue
        html = driver.page_source
        driver.quit()
        bsObj = BeautifulSoup(html, 'lxml')
        div_reviews = bsObj.find_all("div", {"class": "d15Mdf bAhLNe"})
        for div in div_reviews:
            date_ = div.find('span', {"class": "p2TkOb"}).get_text()
            t = re.findall(r"\d*\.\d+|\d+", date_)
            year, month, day = int(t[0]), int(t[1]), int(t[2])
            dd = datetime(year, month, day)
            if start_date <= dd <= end_date:
                content = div.find('span', {'jsname': 'bN97Pc'}).get_text()
                content = content.replace("전체 리뷰", '')
                content = re.sub(r"[^가-힣0-9a-zA-Z_!?@#%^&\-=:;,\"'<>\s]", '', content)
                grade = len(div.find_all('div', {'class': 'vQHuPe bUWb7c'}))
                # No sentiment filter here: every review in the date window is
                # kept, so the GRU prediction step is skipped entirely.
                date_list.append(dd)
                content_list.append(content)
                grade_list.append(grade)
                company_list.append(key)
    data_frame = pd.DataFrame({'company': company_list,
                               'date': date_list,
                               'grade': grade_list,
                               'content': content_list})
    good_data = data_frame[data_frame['grade'] > 2]
    bad_data = data_frame[data_frame['grade'] < 3]
    writer = pd.ExcelWriter("C:/data/thisweek_unclassified.xlsx")
    good_data.to_excel(writer, sheet_name='good', header=True)
    bad_data.to_excel(writer, sheet_name='bad', header=True)
    writer.save()
    if not os.path.exists('C:\\Data\\thisweek_unclassified.csv'):
        data_frame.to_csv('C:\\Data\\thisweek_unclassified.csv', index=False, mode='w', encoding='utf_8_sig')
    else:
        data_frame.to_csv('C:\\Data\\thisweek_unclassified.csv', index=False, mode='a', encoding='utf_8_sig', header=False)
    return data_frame
def none_classified_all_data():
    url_dict = call_url.call_url()
    load = call_all_company.weekly_all_unclassified_data(url_dict)
    return load
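Because each run appends to the same CSV, the accumulated unfiltered reviews can be read back later, for example:

all_data = pd.read_csv('C:\\Data\\thisweek_unclassified.csv', encoding='utf_8_sig')
print(all_data.shape)  # total reviews collected so far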
3. Selected-company data classified by the sentiment analysis engine
def weekly_selected_classified_data(url, company):
    start_date, end_date = start_to_date(), end_to_date()
    maximum, minimum = score_input()
    driver = webdriver.Chrome('C:\\Users\\LeeKwanHyeong\\chromedriver_win32\\chromedriver.exe',
                              chrome_options=options)
    driver.implicitly_wait(2)
    driver.get(url)
    # sort dropdown: open it and choose "newest" with two Up-arrow presses.
    elem = driver.find_element_by_xpath("//span[@class='DPvwYc']")
    elem.click()
    time.sleep(3)
    pyautogui.press('up')
    time.sleep(0.7)
    pyautogui.press('up')
    time.sleep(0.7)
    pyautogui.press('enter')
    # Scroll to the bottom until the "show more" button appears, then click it.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        try:
            element = driver.find_element_by_xpath('//div[@class="U26fgb O0WRkf oG5Srb C0oVfc n9lfJ"]')
            element.click()
            break
        except Exception:
            continue
    html = driver.page_source
    driver.quit()
    bsObj = BeautifulSoup(html, 'lxml')
    div_reviews = bsObj.find_all("div", {"class": "d15Mdf bAhLNe"})
    company_list, grade_list, date_list, content_list = [], [], [], []
    for div in div_reviews:
        date_ = div.find('span', {"class": "p2TkOb"}).get_text()
        t = re.findall(r"\d*\.\d+|\d+", date_)
        year, month, day = int(t[0]), int(t[1]), int(t[2])
        dd = datetime(year, month, day)
        if start_date <= dd <= end_date:
            content = div.find('span', {'jsname': 'bN97Pc'}).get_text()
            content = content.replace("전체 리뷰", '')
            content = re.sub(r"[^가-힣0-9a-zA-Z_!?@#%^&\-=:;,\"'<>\s]", '', content)
            grade = len(div.find_all('div', {'class': 'vQHuPe bUWb7c'}))
            percentage, word = call_GRU_predict.GRU_predict(content)
            # Same sentiment-score band and minimum-length filter as above.
            if minimum < percentage < maximum and len(word) > 6:
                date_list.append(dd)
                content_list.append(content)
                grade_list.append(grade)
                company_list.append(company)
    data_frame = pd.DataFrame({'company': company_list,
                               'date': date_list,
                               'grade': grade_list,
                               'content': content_list})
    good_data = data_frame[data_frame['grade'] > 2]
    bad_data = data_frame[data_frame['grade'] < 3]
    # Append to the weekly CSV, writing the header only on the first run.
    if not os.path.exists('C:\\Data\\thisweek_selected_classified.csv'):
        data_frame.to_csv('C:\\Data\\thisweek_selected_classified.csv', index=False, mode='w', encoding='utf_8_sig')
    else:
        data_frame.to_csv('C:\\Data\\thisweek_selected_classified.csv', index=False, mode='a', encoding='utf_8_sig', header=False)
    writer = pd.ExcelWriter('C:/data/thisweek_selected_classified.xlsx')
    good_data.to_excel(writer, sheet_name='good', header=True)
    bad_data.to_excel(writer, sheet_name='bad', header=True)
    writer.save()
    return data_frame
def classified_selected_data():
    url, company = call_url.select_url()
    load = call_all_company.weekly_selected_classified_data(url, company)
    return load
4. Selected-company data not classified by the sentiment analysis engine
def weekly_selected_unclassified_data(url, company):
    start_date, end_date = start_to_date(), end_to_date()
    driver = webdriver.Chrome('C:\\Users\\LeeKwanHyeong\\chromedriver_win32\\chromedriver.exe',
                              chrome_options=options)
    driver.implicitly_wait(2)
    driver.get(url)
    # sort dropdown: open it and choose "newest" with two Up-arrow presses.
    elem = driver.find_element_by_xpath("//span[@class='DPvwYc']")
    elem.click()
    time.sleep(3)
    pyautogui.press('up')
    time.sleep(0.7)
    pyautogui.press('up')
    time.sleep(0.7)
    pyautogui.press('enter')
    # Scroll to the bottom until the "show more" button appears, then click it.
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(0.5)
        try:
            element = driver.find_element_by_xpath('//div[@class="U26fgb O0WRkf oG5Srb C0oVfc n9lfJ"]')
            element.click()
            break
        except Exception:
            continue
    html = driver.page_source
    driver.quit()
    bsObj = BeautifulSoup(html, 'lxml')
    div_reviews = bsObj.find_all("div", {"class": "d15Mdf bAhLNe"})
    company_list, grade_list, date_list, content_list = [], [], [], []
    for div in div_reviews:
        date_ = div.find('span', {"class": "p2TkOb"}).get_text()
        t = re.findall(r"\d*\.\d+|\d+", date_)
        year, month, day = int(t[0]), int(t[1]), int(t[2])
        dd = datetime(year, month, day)
        if start_date <= dd <= end_date:
            content = div.find('span', {'jsname': 'bN97Pc'}).get_text()
            content = content.replace("전체 리뷰", '')
            content = re.sub(r"[^가-힣0-9a-zA-Z_!?@#%^&\-=:;,\"'<>\s]", '', content)
            grade = len(div.find_all('div', {'class': 'vQHuPe bUWb7c'}))
            # No sentiment filter here: every review in the date window is
            # kept, so the GRU prediction step is skipped entirely.
            date_list.append(dd)
            content_list.append(content)
            grade_list.append(grade)
            company_list.append(company)
    data_frame = pd.DataFrame({'company': company_list,
                               'date': date_list,
                               'grade': grade_list,
                               'content': content_list})
    good_data = data_frame[data_frame['grade'] > 2]
    bad_data = data_frame[data_frame['grade'] < 3]
    writer = pd.ExcelWriter("C:/data/thisweek_selected_unclassified.xlsx")
    good_data.to_excel(writer, sheet_name='good', header=True)
    bad_data.to_excel(writer, sheet_name='bad', header=True)
    writer.save()
    if not os.path.exists('C:\\Data\\thisweek_selected_unclassified.csv'):
        data_frame.to_csv('C:\\Data\\thisweek_selected_unclassified.csv', index=False, mode='w', encoding='utf_8_sig')
    else:
        data_frame.to_csv('C:\\Data\\thisweek_selected_unclassified.csv', index=False, mode='a', encoding='utf_8_sig', header=False)
    return data_frame
def none_classified_selected_data():
    url, company = call_url.select_url()
    load = call_all_company.weekly_selected_unclassified_data(url, company)
    return load
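Both selected-company entry points rely on call_url.select_url(), which, as used above, returns a (url, company) pair. A brief sketch:

# The user picks one company; select_url() returns its review URL and name.
filtered_df = classified_selected_data()        # sentiment-filtered reviews
raw_df = none_classified_selected_data()        # every review in the date window
print(filtered_df.groupby('grade').size())      # star-rating distribution of kept reviews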