169 lines
5.5 KiB
Python
169 lines
5.5 KiB
Python
import os
|
|
import datetime
|
|
import traceback
|
|
import pymysql.cursors
|
|
|
|
# Directory prefix under which the per-month listing files are looked up
# (see get_all_file_name, which builds "<default_file_at>/<YYYY>_<MM>_all").
default_file_at = 'filedb'
|
|
|
|
|
|
def get_all_file_name(year, month):
    """Build the path of the saved listing file for one month.

    The result is ``<default_file_at>/<YYYY>_<MM>_all``: the year is
    zero-padded to four digits and the month to two.
    """
    month_stem = '{y:04d}_{m:02d}_all'.format(y=year, m=month)
    return '{}/{}'.format(default_file_at, month_stem)
|
|
|
|
def get_month_data(year, month):
    """Return the full text of the saved listing file for one month.

    Args:
        year: Year used to locate the file.
        month: Month used to locate the file.

    Returns:
        The file content decoded as UTF-8, or "" when the file does not exist.
    """
    file_name = get_all_file_name(year, month)
    if not os.path.exists(file_name):
        return ""
    # The context manager closes the handle as soon as the content is read;
    # a bare open() here would leave one descriptor open per month processed.
    with open(file_name, mode='r', encoding='utf8') as f:
        return f.read()
|
|
|
|
def get_inner_string(str, prefix, postfix):
    """Extract the text between the first ``prefix`` and the next ``postfix``.

    Args:
        str: Text to search.
        prefix: Marker that precedes the wanted text.
        postfix: Marker that follows the wanted text; it is searched for only
            after the end of ``prefix``.

    Returns:
        A ``(inner, next_index)`` tuple. ``inner`` is the text between the two
        markers with surrounding whitespace stripped and ``next_index`` is the
        index just past ``postfix``. When either marker is missing the result
        is ``("", -1)``.
    """
    prefix_at = str.find(prefix)
    if prefix_at == -1:
        return "", -1

    body_start = prefix_at + len(prefix)
    body_end = str.find(postfix, body_start)
    if body_end == -1:
        return "", -1

    inner = str[body_start:body_end]
    return inner.strip(), body_end + len(postfix)
|
|
|
|
def get_month_total_count(year, month):
    """Return the total count embedded in the saved listing file for a month.

    The count is the text between ``<span class='pit'> `` and ``</span>``.

    Args:
        year: Year used to locate the file.
        month: Month used to locate the file.

    Returns:
        The count as an int, or 0 when the file does not exist or the count
        marker is absent or empty.

    Raises:
        ValueError: If the marker is present but its content is not an integer.
    """
    file_name = get_all_file_name(year, month)
    if not os.path.exists(file_name):
        return 0
    # Close the handle deterministically instead of leaking it.
    with open(file_name, mode='r', encoding='utf8') as f:
        data = f.read()
    inner, _ = get_inner_string(data, "<span class='pit'> ", '</span>')
    # get_inner_string yields "" when the marker is missing; int("") would
    # raise, so treat that the same way as a missing file.
    if not inner:
        return 0
    return int(inner)
|
|
|
|
def parse_yymmdd(yymmdd):
    """Normalize a date string to ``YYYY-MM-DD`` form.

    Two input shapes are accepted: ``%Y-%m-%d`` and ``%Y-%m``. For the
    month-only shape the day is written as ``00``.

    Args:
        yymmdd: Date text to normalize.

    Returns:
        The normalized date string, or '' when the input matches neither
        shape or is not a string.
    """
    # (input format, output format) pairs, tried in order.
    formats = (
        ("%Y-%m-%d", "%Y-%m-%d"),
        ("%Y-%m", "%Y-%m-00"),
    )
    for parse_format, output_format in formats:
        try:
            date_obj = datetime.datetime.strptime(yymmdd, parse_format)
        except ValueError:
            # Not this shape; try the next one. Doing the fallback parse
            # inside an ``except`` handler would let its own ValueError escape.
            continue
        except TypeError:
            # Non-string input (e.g. None) can never match any format.
            return ''
        return date_obj.strftime(output_format)
    return ''
|
|
|
|
class Article:
    """One parsed listing row, able to render its own upsert statement.

    Attributes are plain values taken from the constructor arguments, except
    ``date`` which is passed through ``parse_yymmdd`` first. ``desc`` is
    stored but is not part of ``__str__`` or of the generated SQL.
    """

    # Characters that need a backslash escape inside a single-quoted MySQL
    # string literal.
    # NOTE(review): this assumes the server is not running with
    # NO_BACKSLASH_ESCAPES; passing the values as query parameters to
    # cursor.execute() would avoid depending on that.
    _SQL_ESCAPES = str.maketrans({
        "\0": "\\0",
        "\\": "\\\\",
        "\n": "\\n",
        "\r": "\\r",
        "\x1a": "\\Z",
        '"': '\\"',
        "'": "\\'",
    })

    def __init__(self, date, title_id, title, photo, desc_id, desc):
        self.date = parse_yymmdd(date)
        self.title_id = title_id
        self.title = title
        self.photo = photo
        self.desc_id = desc_id
        self.desc = desc

    def __str__(self):
        return 'date: {}, title_id: {}, title: {}, photo: {}, desc_id: {}'.format(
            self.date, self.title_id, self.title, self.photo, self.desc_id)

    def __repr__(self):
        # Trailing newline puts each article on its own line when a list of
        # articles is printed.
        return str(self) + '\n'

    @classmethod
    def _sql_escape(cls, value):
        """Return ``value`` as text that is safe inside '...' in the SQL."""
        return str(value).translate(cls._SQL_ESCAPES)

    def get_add_sql(self):
        """Return an ``insert ... on duplicate key update`` statement.

        The statement targets ``history_short`` and carries title_id, date,
        title, photo and desc_id. Every value is escaped before being placed
        between quotes, so text containing quotes or backslashes cannot
        terminate the literal early.
        """
        template = (
            "insert into history_short(title_id, date, title, photo, desc_id)"
            " values ('{title_id}', '{date}', '{title}', '{photo}', '{desc_id}') on duplicate key"
            " update date = '{date}', title = '{title}', photo = '{photo}', desc_id = '{desc_id}';"
        )
        return template.format(
            title_id=self._sql_escape(self.title_id),
            date=self._sql_escape(self.date),
            title=self._sql_escape(self.title),
            photo=self._sql_escape(self.photo),
            desc_id=self._sql_escape(self.desc_id),
        )
|
|
|
|
|
|
def get_photo_parsed(str):
    """Extract photo ids and captions from one table cell.

    Each photo is an id inside ``goImageViewerContent('...');`` followed by a
    caption between ``">`` and ``</a>``.

    Args:
        str: Markup of the photo cell.

    Returns:
        One ``"<photo_id>\\t<caption>\\n"`` line per photo, concatenated; ""
        when the cell has no photo. If a photo id has no caption after it,
        that photo is emitted with an empty caption and scanning stops.
    """
    lines = []
    rest = str
    while True:
        photo_id, consumed = get_inner_string(rest, "goImageViewerContent('", "');")
        if consumed == -1:
            break
        # Advance past the id before looking for its caption, so that the
        # caption offset below is relative to the same string it is applied to.
        rest = rest[consumed:]
        photo_desc, consumed = get_inner_string(rest, "\">", "</a>")
        lines.append("{}\t{}\n".format(photo_id, photo_desc))
        if consumed == -1:
            break
        rest = rest[consumed:]
    return "".join(lines)
|
|
|
|
def get_articles(str):
    """Parse the ``<tbody>`` of a listing page into Article objects.

    Each row is read as four consecutive cells: date, title link
    (``<td class="c_pitb alg_l">``), photo and description. A missing cell
    yields "" for it and for the remaining cells of that row; the row is
    still recorded and parsing stops after it.

    Args:
        str: Markup of the whole listing page.

    Returns:
        A list of Article objects in document order (empty when no date cell
        is found).
    """
    remaining, _ = get_inner_string(str, '<tbody>', '</tbody>')
    found = []
    # Openers of the three cells that follow the date cell, in order.
    following_cells = ('<td class="c_pitb alg_l">', '<td>', '<td>')

    while True:
        date, offset = get_inner_string(remaining, '<td>', '</td>')
        if offset == -1:
            break
        remaining = remaining[offset:]

        cells = []
        for opener in following_cells:
            cell, offset = get_inner_string(remaining, opener, '</td>')
            # Sliced unconditionally: after a miss (offset == -1) at most one
            # character is left, so every later lookup misses as well.
            remaining = remaining[offset:]
            cells.append(cell)
        title_href, photo, desc = cells

        title = get_inner_string(title_href, ');">', '</a>')[0]
        title_id = get_inner_string(title_href, ":goItemView('tcct', '", "', '');\">")[0]
        desc_id = get_inner_string(desc, "onclick=\"goFront('", "', '")[0]
        photo_extracted = get_photo_parsed(photo)

        found.append(Article(date, title_id, title, photo_extracted, desc_id, desc))

        if offset == -1:
            break
    return found
|
|
|
|
|
|
def db_connect():
    """Open and return a PyMySQL connection to the ``rep_kr`` database.

    Each setting can be overridden through an environment variable
    (REP_KR_DB_HOST, REP_KR_DB_PORT, REP_KR_DB_USER, REP_KR_DB_PASSWORD,
    REP_KR_DB_NAME); when a variable is unset the built-in value is used.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    env = os.environ
    # NOTE(review): the fallback credentials below are kept only so existing
    # runs keep working. Secrets should not live in source control; supply
    # them through the environment and remove/rotate these defaults.
    return pymysql.connect(
        host=env.get("REP_KR_DB_HOST", "ec2-52-79-235-219.ap-northeast-2.compute.amazonaws.com"),
        port=int(env.get("REP_KR_DB_PORT", "33066")),
        user=env.get("REP_KR_DB_USER", "rep_kr"),
        password=env.get("REP_KR_DB_PASSWORD", "repkrdkagh00"),
        db=env.get("REP_KR_DB_NAME", "rep_kr"),
        charset="utf8mb4",
    )
|
|
|
|
def db_exec(conn, sql, args=None):
    """Execute one statement on ``conn`` and echo it to stdout.

    Args:
        conn: Open database connection providing ``cursor()``.
        sql: Statement text.
        args: Optional parameters forwarded to ``cursor.execute`` so callers
            can use placeholders instead of building SQL by hand. With the
            default ``None`` the statement is sent as-is.

    The statement is printed only after it executed without raising. No
    commit is performed here.
    """
    # The cursor is a context manager; leaving the block closes it instead of
    # accumulating one open cursor per statement.
    with conn.cursor() as db_cur:
        db_cur.execute(sql, args)
    print(sql)
|
|
|
|
def db_add_articles(year, month):
    """Parse one month's saved listing and upsert its articles.

    Uses the module-level ``conn`` connection. Prints a one-line summary
    ``<year>-<month> articles <parsed>/<total>`` when at least one article
    was parsed, or a "No articles" line otherwise.

    Args:
        year: Year of the listing to load.
        month: Month of the listing to load.
    """
    data = get_month_data(year, month)
    articles = get_articles(data)

    if not articles:
        # The template needs .format(); passing the values as extra print()
        # arguments would print the braces literally.
        print("{}-{} No articles".format(year, month))
        return

    total_count = get_month_total_count(year, month)

    # Kept outside the try so the failing statement can be shown on error.
    sql = ""
    try:
        for article in articles:
            sql = article.get_add_sql()
            db_exec(conn, sql)
        conn.commit()
    except Exception:
        # Best effort: report the failure and the offending statement, then
        # carry on so the remaining months are still processed.
        # NOTE(review): nothing is rolled back here, so statements executed
        # before the error stay pending on the connection — confirm whether a
        # rollback is wanted.
        print(traceback.format_exc())
        print(sql)

    print("{}-{} articles {}/{}".format(year, month, len(articles), total_count))
|
|
|
|
def exam(start_year=1945, end_year=2009):
    """Load every month of every year in ``[start_year, end_year)``.

    Args:
        start_year: First year to process (inclusive).
        end_year: Year at which to stop (exclusive).

    With the defaults this covers January 1945 through December 2008.
    """
    for year in range(start_year, end_year):
        for month in range(1, 13):
            db_add_articles(year, month)
|
|
|
|
|
|
# Module-level connection shared by db_add_articles.
conn = db_connect()
try:
    exam()
finally:
    # Release the connection even when the run aborts with an exception.
    conn.close()
|
|
|
|
|
|
"""
|
|
<a href="javascript:;" onclick="goFront('tcct_1951_11_26_0030', '설명', 'commentary','');" class="gbtn"><span><em class="ibtn_left">설명</em></span></a>
|
|
|
|
function goFront(levelId, frontTitle, elName, elType) {
|
|
if(!elType)elType = frontTitle;
|
|
window.open('/item/front.do?levelId='+levelId +'&frontTitle=' + encodeURI(frontTitle) +'&elName=' + elName + '&elType='+encodeURI(elType), '', 'toolbar=no,location=no,directories=no,status=no,menubar=no,scrollbars=yes,resizable=no,top=100,left=250,width=815,height=500');
|
|
}
|
|
"""
|