Commit bf6b0f0f authored by 齐昊宇's avatar 齐昊宇

完成1-3标题解析入库

parents
<component name="ProjectCodeStyleConfiguration">
<state>
<option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
</state>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="dataSourceStorageLocal">
<data-source name="postgres@39.104.109.13" uuid="16830a47-a264-4867-9152-47f09ec57145">
<database-info product="PostgreSQL" version="9.5.12" jdbc-version="4.2" driver-name="PostgreSQL JDBC Driver" driver-version="42.2.5" dbms="POSTGRES" exact-version="9.5.12" exact-driver-version="42.2">
<identifier-quote-string>&quot;</identifier-quote-string>
</database-info>
<case-sensitivity plain-identifiers="lower" quoted-identifiers="exact" />
<secret-storage>master_key</secret-storage>
<user-name>postgres</user-name>
<introspection-schemas>animal_new:@,public</introspection-schemas>
</data-source>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="postgres@39.104.109.13" uuid="16830a47-a264-4867-9152-47f09ec57145">
<driver-ref>postgresql</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>org.postgresql.Driver</jdbc-driver>
<jdbc-url>jdbc:postgresql://39.104.109.13:5432/postgres</jdbc-url>
</data-source>
</component>
</project>
\ No newline at end of file
This diff is collapsed.
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="PublishConfigData">
<serverData>
<paths name="root@39.104.86.100:22">
<serverdata>
<mappings>
<mapping local="$PROJECT_DIR$" web="/" />
</mappings>
</serverdata>
</paths>
</serverData>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="JavaScriptSettings">
<option name="languageLevel" value="ES6" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (python-project)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/oie-yearly-report.iml" filepath="$PROJECT_DIR$/.idea/oie-yearly-report.iml" />
</modules>
</component>
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="TestRunnerService">
<option name="PROJECT_TEST_RUNNER" value="Unittests" />
</component>
</module>
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from postgre.utils import util as pysql
from yearly_report_analysis import year
# Maps the column headers of a parsed summary table to the column names used
# when a row is written to oie_year_report_summary (see handle_all_analysis).
# The 'id' entry covers the disease id that handle_a_table_1 adds to each row.
summary_col_dict = {
'OIE-Listed disease': 'oie_listed_disease',
# NOTE(review): lower-case unlike the other headers - confirm it matches the parsed <th> text.
'occurrence': 'occurrence',
'Serotype(s)': 'serotype',
'New outbreaks': 'new_outbreaks',
'Total outbreaks': 'total_outbreaks',
'Species': 'species',
'Control Measures': 'control_measures',
'Official vaccination': 'official_vaccination',
'Measuring units': 'measuring_units',
'Susceptible': 'susceptible',
'Cases': 'cases',
'Deaths': 'deaths',
'Killed and disposed of': 'killed_and_disposed_of',
'Slaughtered': 'slaughtered',
'Vaccination in response to the outbreak(s)': 'vaccination_in_response_to_the_outbreak',
'id': 'disease_id'
}
# Total number of rows in oie_yearlyreport_new. The query runs at import time;
# presumably pysql.exec returns the first result row, so the count is element 0.
select_length = pysql.exec('SELECT COUNT(*) FROM public.oie_yearlyreport_new')
length = select_length[0]
def _log_progress(message):
    """Print *message* and append it as one line to ana.log."""
    print(message)
    # Open/append/close per message so no handle is ever used after it closed
    # and the lines cannot interleave with other writers of ana.log.
    with open('ana.log', 'a', encoding='utf-8') as log_file:
        log_file.write(message + '\n')


def sub_select(length, disease_dict, global_start=500, batch_size=10):
    """Walk the yearly reports in batches and import each one.

    Reads rows of oie_yearlyreport_new through pysql.selectData using
    LIMIT/OFFSET paging, parses each row's json_text with year.analysis and
    hands the result to handle_all_analysis. Progress is printed and
    appended to ana.log.

    Args:
        length: total number of rows; paging stops before this offset.
        disease_dict: disease name -> id lookup passed on to
            handle_all_analysis.
        global_start: offset of the first row to process (default 500, the
            value that used to be hard-coded).
        batch_size: rows fetched per query (default 10, as before).
    """
    for start in range(global_start, length, batch_size):
        # Same range in the log file and on the console; the last batch is
        # clipped so the message never points past the final row.
        last = min(start + batch_size, length) - 1
        _log_progress('正在处理' + str(start) + '-' + str(last) + ' 共' + str(length) + '条')
        select_res = pysql.selectData(
            ('country_code', 'year', 'type', 'json_text', 'id'),
            'oie_yearlyreport_new',
            False,
            (batch_size, start),
        )
        for k, row in enumerate(select_res, start=1):
            _log_progress('处理第' + str(k) + '条中……')
            a_html = year.analysis(row['json_text'])
            result = handle_all_analysis(
                a_html, disease_dict, row['id'], row['country_code'], row['year'], row['type']
            )
            if result:
                _log_progress('第' + str(k) + '条处理完成')
            else:
                _log_progress('第' + str(k) + '条处理出现问题,已记录')
def handle_all_analysis(a_html, disease_dict, year_report_id, country_code, year, type):
    """Store the summary rows of one parsed yearly report.

    Args:
        a_html: list of parsed sections; element 0 is used as the summary
            table and element 1 is passed to handle_a_table_3.
        disease_dict: disease name -> id lookup used by handle_a_table_1.
        year_report_id: id of the source report row.
        country_code: country code stored with every summary row.
        year: report year stored with every summary row.
        type: report type stored with every summary row.

    Returns:
        True when the report was processed. False when fewer than two
        sections were parsed; the arguments are then appended to err.log.
    """
    print(year_report_id)
    # Both a_html[0] and a_html[1] are read below, so a report with a single
    # section is recorded as a failure instead of raising IndexError.
    if not a_html or len(a_html) < 2:
        with open('err.log', 'a', encoding='utf-8') as file:
            file.write(str([a_html, year_report_id, country_code, year, type]) + '\n')
        return False

    summary_rows = handle_a_table_1(a_html[0], [0, 1, 2], disease_dict)
    for row in summary_rows:
        # Translate header names to DB columns; empty cells are stored as '...'.
        record = {
            summary_col_dict[key]: ('...' if value == '' else value)
            for key, value in row.items()
        }
        record['year_report_id'] = year_report_id
        record['country_code'] = country_code
        record['year'] = year
        record['type'] = type
        if is_exist('oie_year_report_summary', ['oie_listed_disease'], record):
            with open('ana.log', 'a', encoding='utf-8') as file:
                file.write('已存在:' + str(record) + '\n')
            print('已存在:' + str(record))
            continue
        insert_into_summary('oie_year_report_summary', record)

    # The result is not used yet; the call is kept so the second section still
    # passes through handle_a_table_3.
    handle_a_table_3(a_html[1])
    return True
def handle_1(json):
    """Placeholder: accepts *json* and does nothing with it."""
    return None
def _strip_population_suffix(text):
    """Remove the ' (Domestic and Wild)', ' (Domestic)' and ' (Wild)' markers from *text*."""
    for suffix in (' (Domestic and Wild)', ' (Domestic)', ' (Wild)'):
        text = text.replace(suffix, '')
    return text


def handle_a_table_1(table, auto_col, disease_dict):
    """Turn the parsed summary table into a list of header -> value dicts.

    Args:
        table: mapping whose values are dicts with a 'th' list (headers) and
            a 'td' list (rows of cell strings).
        auto_col: column indexes that are forward-filled; an empty cell in
            one of them takes the value of the previous row, and the fill
            restarts whenever column 0 is non-empty.
        disease_dict: disease name -> id lookup. Unknown diseases are
            inserted into oie_disease and the lookup is refreshed IN PLACE,
            so the caller's dict also learns the new id.

    Returns:
        One dict per usable row, keyed by header text, plus an 'id' key that
        holds the disease id. Input rows are not modified.
    """
    auto_col_data = {}
    ok_table = []
    widest_auto_col = max(auto_col, default=-1)
    for section in table.values():
        if len(section) < 1:
            continue
        headers = section['th']
        for cells in section['td']:
            # Skip rows too short to hold the forward-filled columns
            # (previously such rows could raise IndexError).
            if len(cells) < 2 or len(cells) <= widest_auto_col:
                continue
            row = list(cells)  # work on a copy; leave the parsed table intact
            if row[0] != '':
                auto_col_data = {}
            for col in auto_col:
                if row[col].strip() != '':
                    row[col] = _strip_population_suffix(row[col])
                    auto_col_data[col] = row[col]
                else:
                    row[col] = auto_col_data.setdefault(col, '')
            # Cells beyond the header list, or headers beyond the cells, are dropped.
            record = dict(zip(headers, row))
            disease_name = _strip_population_suffix(record['OIE-Listed disease'])
            if disease_name not in disease_dict:
                pysql.insertData({'oiednameen': disease_name}, 'oie_disease')
                # Update the shared dict rather than rebinding a local name;
                # otherwise the next report would insert this disease again.
                disease_dict.update(get_disease_list())
            record['id'] = disease_dict[disease_name]
            ok_table.append(record)
    return ok_table
def handle_a_table_3(title):
    """Return *title* unchanged; no per-row processing is implemented here yet."""
    return title
# Load the lookup of all known diseases.
def get_disease_list():
    """Return a dict mapping each oie_disease name (oiednameen) to its id."""
    rows = pysql.selectData(['oiednameen', 'id'], 'oie_disease')
    return {row['oiednameen']: row['id'] for row in rows}
def insert_into_summary(table_name, data):
    """Insert the *data* mapping as one row of *table_name* through pysql."""
    pysql.insertData(data, tableName=table_name)
def is_exist(table_name, col, data):
    """Return True when *table_name* has at least one row matching *data*.

    *col* lists the columns to select; *data* is passed as the WHERE mapping.
    """
    matches = pysql.selectData(col, table_name, data)
    return bool(matches)
# Script entry: load the disease name -> id lookup once, then process the
# yearly reports. Both statements run when the module is imported.
disease_dict = get_disease_list()
sub_select(length, disease_dict)
# 导入psycopg2包
import psycopg2
import threading
# Column name -> type tag for every table handled by psqlUtil.
# getValueFromInfo reads the tag to decide how a value is rendered in SQL:
# "string"/"date" values are quoted, "int"/"float" values are emitted as-is.
# A column missing here raises KeyError when used in an INSERT or WHERE.
tableInfos = {
"oie_yearlyreport_new": {
"id": "int",
"country_id": "int",
"country_name": "string",
"json_text": "string",
"create_time": "date",
"update_time": "date",
"year": "string",
"type": "string",
"country_code": "string",
"url": "string",
"target_file": "string"
},
# Only the name column is declared; 'id' is selected from this table elsewhere
# but cannot be used in a WHERE/INSERT until it is added here.
'oie_disease': {
'oiednameen': 'string'
},
'oie_year_report_summary': { # presumably report title no. 1 - TODO confirm
'oie_listed_disease': 'string',
'occurrence': 'string',
'serotype': 'string',
'new_outbreaks': 'string',
'total_outbreaks': 'string',
'species': 'string',
'control_measures': 'string',
'official_vaccination': 'string',
'measuring_units': 'string',
'susceptible': 'string',
'cases': 'string',
'deaths': 'string',
'killed_and_disposed_of': 'string',
'slaughtered': 'string',
'vaccination_in_response_to_the_outbreak': 'string',
# NOTE(review): the two ids below are tagged 'string' here but 'int' in the
# other tables - confirm against the actual column types.
'disease_id': 'string',
'year_report_id': 'string',
'country_code': 'string',
'year': 'string',
'type': 'string'
},
"oie_year_report_never_report": { # presumably report title no. 2 - TODO confirm
"animal": 'string',
"country_code": 'string',
"year": "int",
"type": "string",
"disease": "string",
"date_of_last_occurrence": 'string',
"species": "string",
"control_measures": "string",
"disease_id": "int",
"year_report_id": "int",
"official_vaccination": "string",
"id": "int"
},
"oie_year_report_detail": { # presumably report title no. 3 - TODO confirm
'serotype': 'string',
'new_outbreaks': 'string',
'total_outbreaks': 'string',
'species': 'string',
'family_name': 'string',
'latin_name': 'string',
'measuring_units': 'string',
'susceptible': 'string',
'cases': 'string',
'deaths': 'string',
'killed_and_disposed_of': 'string',
'slaughtered': 'string',
'vaccination_in_response_to_the_outbreak': 'string',
'disease_id': 'int',
'disease': 'string',
'type': 'string',
'country_code': 'string',
'year': 'int',
'year_report_id': 'int',
'month': 'string',
'state': 'string'
}
}
class psqlUtil(object):
    """Small PostgreSQL helper built on psycopg2.

    Every public operation opens a fresh connection, runs one statement and
    closes the connection again, so no connection or cursor is left open.
    Values are sent as bound parameters; table and column names are still
    concatenated into the SQL text and must come from trusted code
    (columns are additionally looked up in tableInfos).
    """

    # Set by open(), reset to None by close().
    conn = None
    cursor = None

    def open(self):
        """Connect to the database and create a cursor for the next statement."""
        # Release anything left over from an earlier open() first.
        self.close()
        # NOTE(review): connection settings are hard-coded; consider moving
        # them to configuration or environment variables.
        self.conn = psycopg2.connect(database="animal_new", user="postgres", password="postgres", host="39.104.109.13", port="5432")
        self.cursor = self.conn.cursor()

    def close(self):
        """Close the cursor and the connection; safe to call when nothing is open."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.conn is not None:
            self.conn.close()
            self.conn = None

    @staticmethod
    def _bind(key, value, tableName):
        """Return *value* prepared as a query parameter for column *key*.

        "string"/"date" columns receive str(value), as before; "int"/"float"
        columns receive the value unchanged. Raises KeyError for an unknown
        table or column and TypeError for an unknown type tag.
        """
        typeInfo = tableInfos[tableName][key]
        if typeInfo in ("string", "date"):
            return str(value)
        if typeInfo in ("int", "float"):
            return value
        raise TypeError("unsupported column type %r for %s.%s" % (typeInfo, tableName, key))

    def insertData(self, data, tableName):
        """Insert the column -> value mapping *data* as one row of *tableName*.

        Raises ValueError when *data* is empty, because no valid INSERT can
        be built from it.
        """
        if not data:
            raise ValueError("insertData() needs at least one column")
        columns = list(data)
        params = [self._bind(column, data[column], tableName) for column in columns]
        sql = "INSERT INTO public.{} ({}) values ({})".format(
            tableName, ",".join(columns), ",".join(["%s"] * len(columns))
        )
        self.open()
        try:
            self.cursor.execute(sql, params)
            self.conn.commit()
        finally:
            # Closing without a commit discards a failed statement.
            self.close()

    def selectData(self, fields, tableName, where = False, limit = False):
        """Select *fields* from *tableName* and return a list of dicts.

        Args:
            fields: column names to select; also the keys of each result dict.
            tableName: table inside the public schema.
            where: optional column -> value mapping, combined with AND.
            limit: optional (count, offset) pair for LIMIT/OFFSET.
        """
        # NOTE(review): there is no ORDER BY, so LIMIT/OFFSET paging relies on
        # whatever row order the server happens to return.
        sql = "SELECT " + ",".join(fields) + " FROM public." + tableName
        params = []
        if where:
            clauses = []
            for key, value in where.items():
                clauses.append(key + " = %s")
                params.append(self._bind(key, value, tableName))
            sql += " WHERE " + " AND ".join(clauses)
        if limit:
            sql += " LIMIT %s OFFSET %s"
            params.extend((int(limit[0]), int(limit[1])))
        self.open()
        try:
            # Pass None when nothing is bound so the SQL text is sent as-is.
            self.cursor.execute(sql, params or None)
            rows = self.cursor.fetchall()
        finally:
            self.close()
        return [dict(zip(fields, row)) for row in rows]

    def exec(self, sql):
        """Run the raw statement *sql* and return its first result row."""
        self.open()
        try:
            self.cursor.execute(sql)
            return self.cursor.fetchone()
        finally:
            self.close()
# Render a value as SQL text according to the column's declared type.
def getValueFromInfo(key, value, tableName):
    """Return *value* formatted for direct inclusion in an SQL statement.

    The type tag of column *key* is looked up in tableInfos[tableName]:
    "string"/"date" values are single-quoted with embedded quotes doubled,
    "int"/"float" values are emitted unquoted.

    Raises:
        KeyError: unknown table or column.
        ValueError: a numeric column received a non-numeric value.
        TypeError: the column has an unsupported type tag (previously the
            function returned None and the caller failed on concatenation).
    """
    typeInfo = tableInfos[tableName][key]
    if typeInfo in ("int", "float"):
        text = str(value)
        # Numeric values are not quoted, so make sure the text really is a
        # number before it becomes part of the statement.
        try:
            float(text)
        except ValueError:
            raise ValueError("non-numeric value %r for %s.%s" % (value, tableName, key))
        return text
    if typeInfo in ("string", "date"):
        return "'" + str(value).replace("'", "''") + "'"
    raise TypeError("unsupported column type %r for %s.%s" % (typeInfo, tableName, key))
# Module-level helper instance; presumably the object other modules import as
# their database helper - confirm against the importing code.
util = psqlUtil()
This diff is collapsed.
This diff is collapsed.
from bs4 import BeautifulSoup
import re
class YearlyReportAnalysis():
    """Parse the HTML of a yearly report into per-title table data.

    Title elements are the tags carrying the CSS class
    'TableFoyers2_thtitle' whose text contains one of the known titles in
    ``no``. analysis() returns one ``{title text: parsed data}`` dict per
    title element, in document order.
    """

    html = ''
    titles = []  # title elements found in the document being parsed
    # Known section titles; a title's number is its 1-based position here.
    no = [
        'Summary on OIE-listed diseases/infections present in',  # 1
        'OIE-listed diseases absent in',  # 2
        'Detailed quantitative information for OIE-listed diseases/infections present in',  # 3
        'Unreported OIE-listed diseases during the reporting period',  # 4
        'Summary on non OIE-Listed diseases/infections present in',  # 5
        'Non OIE-Listed diseases absent in',  # 6
        'Detailed quantitative information for non OIE-Listed diseases/infections present in',  # 7
        'Unreported non OIE-Listed diseases',  # 8
        'Zoonotic diseases in humans',  # 9
        'Animal population',  # 10
        'Veterinarians and veterinary para-professionals',  # 11
        'National reference laboratories',  # 12
        'Diagnostic Tests',  # 13
        'Vaccine Manufacturers',  # 14
        'Vaccines',  # 15
        'Vaccine production'  # 16
    ]
    # Title numbers parsed as 'big' sections: the label of each table is taken
    # from the element in front of the table instead of the table's title cell.
    bigTitle = [3, 7]

    def __init__(self):
        pass

    def analysis(self, html):
        """Parse *html* and return the list produced by analysis_1()."""
        self.html = html
        self.init_analysis()
        return self.analysis_1()

    def init_analysis(self):
        """Build the soup for self.html and collect its title elements."""
        soup = BeautifulSoup(self.html, features='html.parser')
        self.find_class(soup)

    # Parsing entry point.
    def analysis_1(self):
        """Return one ``{title text: parsed data}`` dict per title element."""
        result = []
        for title_tag in self.titles:
            if title_tag.name == 'td':
                parsed = self.analysis_title_is_td(title_tag)
            else:
                parsed = self.analysis_title_is_div(title_tag)
            result.append({str(title_tag.text.strip()): parsed})
        return result

    def analysis_which_title(self, title):
        """Return the 1-based number of the first known title found in title.text.

        Raises TypeError when the text contains none of the known titles.
        """
        for number, known in enumerate(self.no, start=1):
            if known in title.text:
                return number
        raise TypeError('Number will be None! Please check')

    @staticmethod
    def _is_text_node(node):
        """Return True for text nodes (they expose a callable ``strip``)."""
        # On a Tag, attribute access looks up a child tag named 'strip' and
        # normally yields None, so the result is not callable.
        return callable(node.strip)

    @staticmethod
    def _clean_cell(text):
        """Drop newlines, trim, and collapse runs of whitespace to one space."""
        return re.sub(r'\s{2,}', ' ', text.replace('\n', '').strip())

    @staticmethod
    def _strip_population_suffix(text):
        """Remove the ' (Domestic and Wild)', ' (Domestic)' and ' (Wild)' markers."""
        for suffix in (' (Domestic and Wild)', ' (Domestic)', ' (Wild)'):
            text = text.replace(suffix, '')
        return text

    # Handler used when the title element is a table cell.
    def analysis_title_is_td(self, html):
        """Parse the table that contains the title cell *html*.

        Returns the analysis_table() dict extended with 'type' = 'table' and
        'no' = the title number.
        """
        number = self.analysis_which_title(html)
        enclosing_table = html.parent.parent
        header_row = enclosing_table.find('th').parent
        table = self.analysis_table(header_row)
        table['type'] = 'table'
        table['no'] = number
        return table

    # Handler used when the title element is a div.
    def analysis_title_is_div(self, html):
        """Parse every table that follows the title element *html*.

        Returns ``{'type': 'big' | 'usual', 'no': number, 'tables': [...]}``.
        For 'usual' titles each entry is ``{table title: table}``; for 'big'
        titles it is ``{heading: {'type': table title, 'table': table}}``,
        where an empty heading reuses the last non-empty one.
        """
        number = self.analysis_which_title(html)
        is_big = number in self.bigTitle
        tables = {'type': 'big' if is_big else 'usual', 'no': number, 'tables': []}
        big_title = ''
        for node in self.analysis_title_is_div_tables(html):
            if self._is_text_node(node):
                continue
            title_cell = node.find(class_='TableFoyers2_thtitle')
            table_type = title_cell.text.strip()
            th = node.find('th')
            if not th:
                # No <th>: use the row two siblings after the first <tr>.
                header_row = node.find('tr').next_sibling.next_sibling
            else:
                header_row = th.parent
            table = self.analysis_table(header_row)
            if is_big:
                heading = node.previous_sibling.previous_sibling
                title = self._strip_population_suffix(heading.contents[0].strip())
                if title != '':
                    big_title = title
                else:
                    title = big_title
                tables['tables'].append({title: {'type': table_type, 'table': table}})
            else:
                title = title_cell.contents[0].strip()
                tables['tables'].append({title: table})
        return tables

    # With a div title, collect all tables that follow it.
    def analysis_title_is_div_tables(self, html):
        """Return the <table> siblings after *html*, stopping at the next title."""
        found = []
        for sibling in html.next_siblings:
            if self._is_text_node(sibling):
                continue
            if sibling in self.titles:
                break
            if sibling.find(class_='TableFoyers2_thtitle') in self.titles:
                break
            if sibling.name == 'table':
                found.append(sibling)
        return found

    # Split a table into header and data rows.
    def analysis_table(self, th_html):
        """Return ``{'th': headers, 'td': rows}`` starting at the row *th_html*.

        When the row has no <th> cells its <td> cells become the first data
        row and 'th' stays empty. Following sibling rows are read until a
        title element is reached; rows without <td> cells are skipped.
        """
        th = []
        td = []
        header_cells = th_html.find_all('th')
        if not header_cells:
            first_row = [self._clean_cell(cell.text) for cell in th_html.find_all('td')]
            td.append(first_row)
        else:
            th = [cell.text.strip() for cell in header_cells]
        for sibling in th_html.next_siblings:
            if self._is_text_node(sibling):
                continue
            if sibling in self.titles:
                break
            row = [self._clean_cell(cell.text) for cell in sibling.find_all('td')]
            if row:
                td.append(row)
        return {"th": th, "td": td}

    # Find every DOM node that carries the title class.
    def find_class(self, html):
        """Store all title elements of the soup *html* in self.titles."""
        self.titles = html.find_all(self.true_title)

    # Is this a real title?
    def true_title(self, tag):
        """Return True when *tag* has the title class and a known title in its text."""
        classes = tag.attrs.get('class')
        if classes and 'TableFoyers2_thtitle' in classes:
            return any(known in tag.text for known in self.no)
        return False
# Module-level parser instance; analysis() stores the current document on it,
# so the instance carries state from the most recent call.
year = YearlyReportAnalysis()
|标题|标号|
|---|---|
|Summary on OIE-listed diseases/infections present in| 1|
|OIE-listed diseases absent in| 2|
|Detailed quantitative information for OIE-listed diseases/infections present in| 3|
|Unreported OIE-listed diseases during the reporting period| 4|
|Summary on non OIE-Listed diseases/infections present in| 5|
|Non OIE-Listed diseases absent in| 6|
|Detailed quantitative information for non OIE-Listed diseases/infections present in| 7|
|Unreported non OIE-Listed diseases| 8|
|Zoonotic diseases in humans| 9|
|Animal population| 10|
|Veterinarians and veterinary para-professionals| 11|
|National reference laboratories| 12|
|Diagnostic Tests| 13|
|Vaccine Manufacturers| 14|
|Vaccines| 15|
|Vaccine production|16|
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment