Files
rep-kr-history-db/load_db.py

169 lines
5.5 KiB
Python
Raw Normal View History

import os
import datetime
import traceback
import pymysql.cursors
# Directory prefix that get_all_file_name() puts in front of every
# per-month data file name.
default_file_at = 'filedb'
def get_all_file_name(year, month):
    """Build the path of the combined data file for one month.

    The result has the form ``<default_file_at>/<YYYY>_<MM>_all``, with the
    year zero-padded to four digits and the month to two.

    Args:
        year: Integer year.
        month: Integer month.

    Returns:
        The path as a string.
    """
    return f"{default_file_at}/{year:04d}_{month:02d}_all"
def get_month_data(year, month):
    """Return the full text of one month's data file.

    Args:
        year: Integer year used to locate the file.
        month: Integer month used to locate the file.

    Returns:
        The file content decoded as UTF-8, or an empty string when no file
        exists at the path produced by get_all_file_name().
    """
    file_name = get_all_file_name(year, month)
    if not os.path.exists(file_name):
        return ""
    # The context manager closes the handle as soon as the content is read.
    with open(file_name, mode='r', encoding='utf8') as f:
        return f.read()
def get_inner_string(str, prefix, postfix):
    """Extract the text between the first *prefix* and the next *postfix*.

    Args:
        str: Text to search.
        prefix: Marker that precedes the wanted text.
        postfix: Marker that follows the wanted text; it is searched for
            only after the end of *prefix*.

    Returns:
        A ``(inner, next_index)`` tuple. ``inner`` is the enclosed text with
        surrounding whitespace stripped and ``next_index`` is the offset in
        *str* just past *postfix*. When either marker is missing the result
        is ``("", -1)``.
    """
    prefix_at = str.find(prefix)
    if prefix_at == -1:
        return "", -1

    inner_from = prefix_at + len(prefix)
    inner_to = str.find(postfix, inner_from)
    if inner_to == -1:
        return "", -1

    return str[inner_from:inner_to].strip(), inner_to + len(postfix)
def get_month_total_count(year, month):
    """Return the article count embedded in one month's data file.

    The count is read from the first ``<span class='pit'> ... </span>``
    element found in the file.

    Args:
        year: Integer year used to locate the file.
        month: Integer month used to locate the file.

    Returns:
        The count as an int, or 0 when the file does not exist or contains
        no such span.

    Raises:
        ValueError: If the span is present but its text is not an integer.
    """
    file_name = get_all_file_name(year, month)
    if not os.path.exists(file_name):
        return 0
    # The context manager closes the handle as soon as the content is read.
    with open(file_name, mode='r', encoding='utf8') as f:
        data = f.read()
    inner, _ = get_inner_string(data, "<span class='pit'> ", '</span>')
    if not inner:
        # Marker missing: treat it like a missing file rather than letting
        # int("") raise.
        return 0
    return int(inner)
def parse_yymmdd(yymmdd):
    """Normalize a date string to ``YYYY-MM-DD`` form.

    Two input layouts are accepted: ``YYYY-MM-DD`` and ``YYYY-MM``. For the
    latter the day is written as ``00``.

    Args:
        yymmdd: Date text to normalize.

    Returns:
        The normalized date string, or ``''`` when the input matches neither
        layout or is not a string.
    """
    layouts = (
        ("%Y-%m-%d", "%Y-%m-%d"),
        ("%Y-%m", "%Y-%m-00"),
    )
    for parse_format, output_format in layouts:
        try:
            date_obj = datetime.datetime.strptime(yymmdd, parse_format)
        except ValueError:
            # Wrong layout for this attempt; try the next one. Each attempt
            # has its own handler so a failure of the fallback layout ends in
            # '' instead of escaping to the caller.
            continue
        except TypeError:
            # Non-string input (e.g. None) cannot match any layout.
            return ''
        return date_obj.strftime(output_format)
    return ''
class Article:
    """One parsed article row and the SQL needed to upsert it.

    Attributes:
        date: Result of parse_yymmdd() applied to the constructor's *date*.
        title_id: Value used for the ``title_id`` column.
        title: Value used for the ``title`` column.
        photo: Value used for the ``photo`` column.
        desc_id: Value used for the ``desc_id`` column.
        desc: Kept on the object but not written by get_add_sql().
    """

    # Backslash escapes for the characters that would otherwise terminate or
    # corrupt a quoted MySQL string literal.
    # NOTE(review): assumes the server is not running with the
    # NO_BACKSLASH_ESCAPES SQL mode - confirm.
    _SQL_ESCAPES = str.maketrans({
        "\0": "\\0",
        "\\": "\\\\",
        "\n": "\\n",
        "\r": "\\r",
        "\x1a": "\\Z",
        '"': '\\"',
        "'": "\\'",
    })

    def __init__(self, date, title_id, title, photo, desc_id, desc):
        self.date = parse_yymmdd(date)
        self.title_id = title_id
        self.title = title
        self.photo = photo
        self.desc_id = desc_id
        self.desc = desc

    def __str__(self):
        return 'date: {}, title_id: {}, title: {}, photo: {}, desc_id: {}'.format(
            self.date, self.title_id, self.title, self.photo, self.desc_id)

    def __repr__(self):
        # Trailing newline puts each article on its own line when a list of
        # articles is printed.
        return self.__str__() + '\n'

    @classmethod
    def _escape(cls, value):
        """Return *value* as text that is safe inside a quoted SQL literal."""
        return str(value).translate(cls._SQL_ESCAPES)

    def get_add_sql(self):
        """Build the ``insert ... on duplicate key update`` statement.

        Every value is escaped before it is placed between single quotes, so
        text containing quotes or backslashes neither breaks the statement
        nor injects SQL.

        Returns:
            The complete SQL statement as a string.
        """
        # NOTE(review): driver-side parameter binding would be preferable to
        # escaping here, but it requires the caller to pass the values
        # separately from the statement text.
        values = {
            column: self._escape(getattr(self, column))
            for column in ('title_id', 'date', 'title', 'photo', 'desc_id')
        }
        sql = "insert into history_short(title_id, date, title, photo, desc_id)" \
              " values ('{title_id}', '{date}', '{title}', '{photo}', '{desc_id}') on duplicate key" \
              " update date = '{date}', title = '{title}', photo = '{photo}', desc_id = '{desc_id}';"
        return sql.format(**values)
def get_photo_parsed(str):
    """Collect the photo ids and link texts found in a piece of markup.

    Each photo is recognised by a ``goImageViewerContent('<id>');`` call; the
    text between the following ``">`` and ``</a>`` is taken as its
    description.

    Args:
        str: Markup to scan.

    Returns:
        One ``"<id>\\t<description>\\n"`` line per photo, concatenated in
        document order; an empty string when no photo is found.
    """
    lines = []
    remaining = str
    while True:
        photo_id, consumed = get_inner_string(remaining, "goImageViewerContent('", "');")
        if consumed == -1:
            break
        # Offsets returned by get_inner_string() are relative to the text it
        # was given, so advance the cursor after every lookup; otherwise the
        # same anchor is matched again on the next pass.
        remaining = remaining[consumed:]

        photo_desc, consumed = get_inner_string(remaining, "\">", "</a>")
        lines.append("{}\t{}\n".format(photo_id, photo_desc))
        if consumed == -1:
            # The id had no closing link text; it is recorded with an empty
            # description and scanning stops.
            break
        remaining = remaining[consumed:]
    return "".join(lines)
def get_articles(str):
    """Parse the rows of the first ``<tbody>`` in *str* into Article objects.

    Each row is read as four consecutive cells: a date cell, a title cell
    (``<td class="c_pitb alg_l">``), a photo cell and a description cell.

    Args:
        str: Markup of one month's data file.

    Returns:
        A list of Article objects in document order; empty when no
        ``<tbody>`` or no date cell is found.
    """
    rest, _ = get_inner_string(str, '<tbody>', '</tbody>')
    collected = []
    while True:
        date, offset = get_inner_string(rest, '<td>', '</td>')
        if offset == -1:
            # No further date cell: the table is exhausted.
            break
        rest = rest[offset:]

        # The three remaining cells of the row, consumed strictly in order.
        cells = []
        for opening in ('<td class="c_pitb alg_l">', '<td>', '<td>'):
            cell, offset = get_inner_string(rest, opening, '</td>')
            rest = rest[offset:]
            cells.append(cell)
        title_href, photo, desc = cells

        # Pull the individual fields out of the title and description cells.
        title = get_inner_string(title_href, ');">', '</a>')[0]
        title_id = get_inner_string(title_href, ":goItemView('tcct', '", "', '');\">")[0]
        desc_id = get_inner_string(desc, "onclick=\"goFront('", "', '")[0]

        collected.append(
            Article(date, title_id, title, get_photo_parsed(photo), desc_id, desc)
        )

        if offset == -1:
            # The last cell lookup failed, so nothing parseable remains.
            break
    return collected
def db_connect():
    """Open a new PyMySQL connection to the database.

    Each setting may be overridden with an environment variable
    (``REP_KR_DB_HOST``, ``REP_KR_DB_PORT``, ``REP_KR_DB_USER``,
    ``REP_KR_DB_PASSWORD``, ``REP_KR_DB_NAME``). When a variable is unset the
    built-in value below is used, so existing invocations behave as before.

    Returns:
        The connection object returned by ``pymysql.connect()``.
    """
    env = os.environ
    # SECURITY: the fallback host and password are committed in source. They
    # are kept only so the script keeps working unchanged; rotate the
    # password and supply it through REP_KR_DB_PASSWORD instead.
    conn = pymysql.connect(
        host=env.get("REP_KR_DB_HOST", "ec2-52-79-235-219.ap-northeast-2.compute.amazonaws.com"),
        port=int(env.get("REP_KR_DB_PORT", "33066")),
        user=env.get("REP_KR_DB_USER", "rep_kr"),
        password=env.get("REP_KR_DB_PASSWORD", "repkrdkagh00"),
        db=env.get("REP_KR_DB_NAME", "rep_kr"),
        charset="utf8mb4",
    )
    return conn
def db_exec(conn, sql, params=None):
    """Execute one SQL statement on *conn* and echo it to stdout.

    Args:
        conn: Open database connection providing ``cursor()``.
        sql: Statement text. May contain driver placeholders when *params*
            is supplied.
        params: Optional values for the driver to bind into *sql*. With the
            default ``None`` the statement is sent exactly as given.
    """
    # The context manager closes the cursor even when execute() raises.
    with conn.cursor() as db_cur:
        db_cur.execute(sql, params)
        print(sql)
def db_add_articles(year, month):
    """Parse one month's data file and upsert its articles into the database.

    Uses the module-level ``conn`` connection, which must be bound before
    this function is called. Progress and failures are reported on stdout.

    Args:
        year: Integer year of the file to load.
        month: Integer month of the file to load.
    """
    articles = get_articles(get_month_data(year, month))
    if not articles:
        print("{}-{} No articles".format(year, month))
        return

    total_count = get_month_total_count(year, month)

    # Tracks the statement being executed so it can be reported on failure.
    sql = ""
    try:
        for article in articles:
            sql = article.get_add_sql()
            db_exec(conn, sql)
        conn.commit()
    except Exception:
        # Best effort: report the failure and the offending statement, then
        # let the caller continue with the next month.
        print(traceback.format_exc())
        print(sql)

    # Summary line: parsed article count versus the count stated in the file.
    print("{}-{} articles {}/{}".format(year, month, len(articles), total_count))
def exam(first_year=1945, last_year=2008):
    """Load every month of every year in the given range into the database.

    Args:
        first_year: First year to load (inclusive).
        last_year: Last year to load (inclusive).
    """
    for year in range(first_year, last_year + 1):
        for month in range(1, 13):
            db_add_articles(year, month)
# db_add_articles() reads this module-level `conn`, so it has to be bound
# before exam() runs.
conn = db_connect()
try:
    exam()
finally:
    # Release the connection even when the load aborts with an exception.
    conn.close()
"""
<a href="javascript:;" onclick="goFront('tcct_1951_11_26_0030', '설명', 'commentary','');" class="gbtn"><span><em class="ibtn_left">설명</em></span></a>
function goFront(levelId, frontTitle, elName, elType) {
if(!elType)elType = frontTitle;
window.open('/item/front.do?levelId='+levelId +'&frontTitle=' + encodeURI(frontTitle) +'&elName=' + elName + '&elType='+encodeURI(elType), '', 'toolbar=no,location=no,directories=no,status=no,menubar=no,scrollbars=yes,resizable=no,top=100,left=250,width=815,height=500');
}
"""