完成1-3标题解析入库

bf6b0f0f · 齐昊宇 · bf6b0f0f · bf6b0f0f · bf6b0f0f · bf6b0f0f
Commit bf6b0f0f authored Sep 29, 2019 by 齐昊宇
29 changed files
--- a/.idea/codeStyles/codeStyleConfig.xml
+++ b/.idea/codeStyles/codeStyleConfig.xml
+<component name="ProjectCodeStyleConfiguration">
+  <state>
+    <option name="PREFERRED_PROJECT_CODE_STYLE" value="Default" />
+  </state>
+</component>
\ No newline at end of file
--- a/.idea/dataSources.local.xml
+++ b/.idea/dataSources.local.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="dataSourceStorageLocal">
+    <data-source name="postgres@39.104.109.13" uuid="16830a47-a264-4867-9152-47f09ec57145">
+      <database-info product="PostgreSQL" version="9.5.12" jdbc-version="4.2" driver-name="PostgreSQL JDBC Driver" driver-version="42.2.5" dbms="POSTGRES" exact-version="9.5.12" exact-driver-version="42.2">
+        <identifier-quote-string>&quot;</identifier-quote-string>
+      </database-info>
+      <case-sensitivity plain-identifiers="lower" quoted-identifiers="exact" />
+      <secret-storage>master_key</secret-storage>
+      <user-name>postgres</user-name>
+      <introspection-schemas>animal_new:@,public</introspection-schemas>
+    </data-source>
+  </component>
+</project>
\ No newline at end of file
--- a/.idea/dataSources.xml
+++ b/.idea/dataSources.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
+    <data-source source="LOCAL" name="postgres@39.104.109.13" uuid="16830a47-a264-4867-9152-47f09ec57145">
+      <driver-ref>postgresql</driver-ref>
+      <synchronize>true</synchronize>
+      <jdbc-driver>org.postgresql.Driver</jdbc-driver>
+      <jdbc-url>jdbc:postgresql://39.104.109.13:5432/postgres</jdbc-url>
+    </data-source>
+  </component>
+</project>
\ No newline at end of file
--- a/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145.xml
+++ b/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145.xml
--- a/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg.meta
+++ b/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg.meta
+#n:animal_new
\ No newline at end of file
--- a/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/oie.y60BAA.meta
+++ b/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/oie.y60BAA.meta
+#n:oie
+!<md> [null, 0, null, null, -2147483648, -2147483648]
--- a/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/public.abK9xQ.meta
+++ b/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/public.abK9xQ.meta
+#n:public
+!<md> [19114362, 0, null, null, -2147483648, -2147483648]
--- a/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/public.abK9xQ.zip
+++ b/.idea/dataSources/16830a47-a264-4867-9152-47f09ec57145/storage_v2/_src_/database/animal_new.3QhDEg/schema/public.abK9xQ.zip
--- a/.idea/deployment.xml
+++ b/.idea/deployment.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="PublishConfigData">
+    <serverData>
+      <paths name="root@39.104.86.100:22">
+        <serverdata>
+          <mappings>
+            <mapping local="$PROJECT_DIR$" web="/" />
+          </mappings>
+        </serverdata>
+      </paths>
+    </serverData>
+  </component>
+</project>
\ No newline at end of file
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6 (python-project)" project-jdk-type="Python SDK" />
+</project>
\ No newline at end of file
--- a/.idea/modules.xml
+++ b/.idea/modules.xml
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/oie-yearly-report.iml" filepath="$PROJECT_DIR$/.idea/oie-yearly-report.iml" />
+    </modules>
+  </component>
+</project>
\ No newline at end of file
--- a/.idea/oie-yearly-report.iml
+++ b/.idea/oie-yearly-report.iml
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>
\ No newline at end of file
--- a/.idea/workspace.xml
+++ b/.idea/workspace.xml
--- a/OIE World Animal Health Information System.pdf
+++ b/OIE World Animal Health Information System.pdf
--- a/YEAR.html
+++ b/YEAR.html
--- a/__pycache__/yearly_report_analysis.cpython-36.pyc
+++ b/__pycache__/yearly_report_analysis.cpython-36.pyc
--- a/ana.log
+++ b/ana.log
--- a/err.log
+++ b/err.log
--- a/have_7.html
+++ b/have_7.html
--- a/main.py
+++ b/main.py
+from postgre.utils import util as pysql
+from yearly_report_analysis import year
+# Maps the column headers of the parsed summary table (plus the synthetic 'id'
+# key added by handle_a_table_1) to the column names used when the row is
+# written to oie_year_report_summary.
+summary_col_dict = {
+    'OIE-Listed disease': 'oie_listed_disease',
+    'occurrence': 'occurrence',
+    'Serotype(s)': 'serotype',
+    'New outbreaks': 'new_outbreaks',
+    'Total outbreaks': 'total_outbreaks',
+    'Species': 'species',
+    'Control Measures': 'control_measures',
+    'Official vaccination': 'official_vaccination',
+    'Measuring units': 'measuring_units',
+    'Susceptible': 'susceptible',
+    'Cases': 'cases',
+    'Deaths': 'deaths',
+    'Killed and disposed of': 'killed_and_disposed_of',
+    'Slaughtered': 'slaughtered',
+    'Vaccination in response to the outbreak(s)': 'vaccination_in_response_to_the_outbreak',
+    'id': 'disease_id'
+}
+
+
+# Row count of public.oie_yearlyreport_new. The query helper returns a single
+# result row, so element 0 is the COUNT(*) value.
+# NOTE: this query runs as soon as the module is imported.
+select_length = pysql.exec('SELECT COUNT(*) FROM public.oie_yearlyreport_new')
+length = select_length[0]
+
+
+def sub_select(length, disease_dict):
+    global_start = 500
+    for i in range(global_start, length, 10):
+        count = 10
+        start = i
+        with open('ana.log', 'a', encoding='utf-8') as file:
+            file.write('正在处理' + str(i) + '-' + str(i+9) + '  共' + str(length) + '条\n')
+        print('正在处理' + str(i) + '-' + str(i+10) + '  共' + str(length) + '条')
+        select_res = pysql.selectData(('country_code', 'year', 'type', 'json_text', 'id'), 'oie_yearlyreport_new', False, (count, start))
+        k = 1
+        for j in select_res:
+            with open('ana.log', 'a', encoding='utf-8') as file:
+                file.write('处理第' + str(k) + '条中……\n')
+                print('处理第' + str(k) + '条中……')
+                a_html = year.analysis(j['json_text'])
+                result = handle_all_analysis(a_html, disease_dict, j['id'], j['country_code'], j['year'], j['type'])
+                if result:
+                    print('第' + str(k) + '条处理完成')
+                    file.write('第' + str(k) + '条处理完成\n')
+                else:
+                    print('第' + str(k) + '条处理出现问题，已记录')
+                    file.write('第' + str(k) + '条处理出现问题，已记录\n')
+                k = k + 1
+
+
+def handle_all_analysis(a_html, disease_dict, year_report_id, country_code, year, type):
+    print(year_report_id)
+    if a_html == []:
+        with open('err.log', 'a', encoding='utf-8') as file:
+            file.writelines(str([a_html, year_report_id, country_code, year, type]) + '\n')
+        return False
+    table_1 = a_html[0]
+    table_3 = a_html[1]
+    table_1 = handle_a_table_1(table_1, [0, 1, 2], disease_dict)
+    for i in table_1:
+        tmp = {}
+        for key, value in i.items():
+            tmp[summary_col_dict[key]] = '...' if value == '' else value
+        tmp['year_report_id'] = year_report_id
+        tmp['country_code'] = country_code
+        tmp['year'] = year
+        tmp['type'] = type
+        if is_exist('oie_year_report_summary', ['oie_listed_disease'], tmp):
+            with open('ana.log', 'a', encoding='utf-8') as file:
+                file.write('已存在:' + str(tmp) + '\n')
+            print('已存在:' + str(tmp))
+            continue
+        else:
+            insert_into_summary('oie_year_report_summary', tmp)
+    table_3 = handle_a_table_3(table_3)
+    return True
+
+
+def handle_1(json):
+    pass
+
+
+def handle_a_table_1(table, auto_col, disease_dict):
+    auto_col_data = {}
+    ok_table = []
+    for tr in table:
+        if len(table[tr]) < 1:
+            continue
+        for td in table[tr]['td']:
+            if len(td) < 2:
+                continue
+            if td[0] != '':
+                auto_col_data = {}
+            for i in auto_col:
+                if td[i].strip() != '':
+                    td[i] = td[i].replace(' (Domestic and Wild)', '')
+                    td[i] = td[i].replace(' (Domestic)', '')
+                    td[i] = td[i].replace(' (Wild)', '')
+                    auto_col_data[i] = td[i]
+                else:
+                    if i not in auto_col_data:
+                        auto_col_data[i] = ''
+                    td[i] = auto_col_data[i]
+            tmp_td = {}
+            j = 0
+            for i in table[tr]['th']:
+                tmp_td[i] = td[j]
+                j = j + 1
+            disease_name = tmp_td['OIE-Listed disease']
+            disease_name = disease_name.replace(' (Domestic and Wild)', '')
+            disease_name = disease_name.replace(' (Domestic)', '')
+            disease_name = disease_name.replace(' (Wild)', '')
+            if (disease_name not in disease_dict):
+                pysql.insertData({'oiednameen': disease_name}, 'oie_disease')
+                disease_dict = get_disease_list()
+            tmp_td['id'] = disease_dict[disease_name]
+            ok_table.append(tmp_td)
+    return ok_table
+
+def handle_a_table_3(title):
+    ok_table = title
+    # for a_title in title:
+    #     for td in title[a_title]:
+    #         print(td)
+    return ok_table
+
+
+# 获得所有疫病的list
+def get_disease_list():
+    disease_list = pysql.selectData(['oiednameen', 'id'], 'oie_disease')
+    disease_detail = {}
+    for i in disease_list:
+        disease_detail[i['oiednameen']] = i['id']
+    return disease_detail
+
+
+def insert_into_summary(table_name, data):
+    pysql.insertData(data, table_name)
+
+def is_exist (table_name, col, data):
+    exsist = pysql.selectData(col, table_name, data)
+    if exsist:
+        return True
+    else:
+        return False
+
+
+
+# Script entry: load the disease name -> id lookup once, then process the reports.
+# NOTE: these statements run at import time (there is no __main__ guard).
+disease_dict = get_disease_list()
+sub_select(length, disease_dict)
--- a/postgre/__init__.py
+++ b/postgre/__init__.py
--- a/postgre/__pycache__/__init__.cpython-36.pyc
+++ b/postgre/__pycache__/__init__.cpython-36.pyc
--- a/postgre/__pycache__/utils.cpython-36.pyc
+++ b/postgre/__pycache__/utils.cpython-36.pyc
--- a/postgre/utils.py
+++ b/postgre/utils.py
+# Import the psycopg2 package
+import psycopg2
+import threading
+# Declared column types per table: table name -> {column name -> type}.
+# getValueFromInfo reads the type to decide how a value is rendered in the
+# generated SQL ("int"/"float" are emitted as-is, "string"/"date" are quoted).
+tableInfos = {
+  "oie_yearlyreport_new": {
+    "id": "int",
+    "country_id": "int",
+    "country_name": "string",
+    "json_text": "string",
+    "create_time": "date",
+    "update_time": "date",
+    "year": "string",
+    "type": "string",
+    "country_code": "string",
+    "url": "string",
+    "target_file": "string"
+  },
+  'oie_disease': {
+      'oiednameen': 'string'
+  },
+  'oie_year_report_summary': {  # 1
+    'oie_listed_disease': 'string',
+    'occurrence': 'string',
+    'serotype': 'string',
+    'new_outbreaks': 'string',
+    'total_outbreaks': 'string',
+    'species': 'string',
+    'control_measures': 'string',
+    'official_vaccination': 'string',
+    'measuring_units': 'string',
+    'susceptible': 'string',
+    'cases': 'string',
+    'deaths': 'string',
+    'killed_and_disposed_of': 'string',
+    'slaughtered': 'string',
+    'vaccination_in_response_to_the_outbreak': 'string',
+    'disease_id': 'string',
+    'year_report_id': 'string',
+    'country_code': 'string',
+    'year': 'string',
+    'type': 'string'
+  },
+  "oie_year_report_never_report": {  # 2
+    "animal": 'string',
+    "country_code": 'string',
+    "year": "int",
+    "type": "string",
+    "disease": "string",
+    "date_of_last_occurrence": 'string',
+    "species": "string",
+    "control_measures": "string",
+    "disease_id": "int",
+    "year_report_id": "int",
+    "official_vaccination": "string",
+    "id": "int"
+  },
+  "oie_year_report_detail": {  # 3
+    'serotype': 'string',
+    'new_outbreaks': 'string',
+    'total_outbreaks': 'string',
+    'species': 'string',
+    'family_name': 'string',
+    'latin_name': 'string',
+    'measuring_units': 'string',
+    'susceptible': 'string',
+    'cases': 'string',
+    'deaths': 'string',
+    'killed_and_disposed_of': 'string',
+    'slaughtered': 'string',
+    'vaccination_in_response_to_the_outbreak': 'string',
+    'disease_id': 'int',
+    'disease': 'string',
+    'type': 'string',
+    'country_code': 'string',
+    'year': 'int',
+    'year_report_id': 'int',
+    'month': 'string',
+    'state': 'string'
+  }
+}
+
+class psqlUtil(object):
+
+    def open(self):
+        # if not hasattr(self, "conn"):
+        # 连接到一个给定的数据库
+        self.conn = psycopg2.connect(database="animal_new", user="postgres", password="postgres", host="39.104.109.13", port="5432")
+        # 建立游标，用来执行数据库操作
+        self.cursor = self.conn.cursor()
+        # print("__init__.py", self.conn)
+
+    def close(self):
+        # 关闭游标
+        self.cursor.close()
+        # 关闭数据库连接
+        self.conn.close()
+
+    def insertData(self, data, tableName):
+        self.open()
+        sql = "INSERT INTO public." + tableName + " ("
+        for key, info in data.items():
+            sql += key + ","
+        sql = sql[0:-1:1]
+        sql += ") values ("
+        for key, value in data.items():
+            sql += getValueFromInfo(key, value, tableName) + ","
+        sql = sql[0:-1:1]
+        sql += ")"
+        # print(sql)
+        self.cursor.execute(sql)
+        self.conn.commit()
+
+    def selectData(self, fields, tableName, where = False,limit = False):
+        self.open()
+        sql = "SELECT "
+        for f in fields:
+            sql += f + ","
+        sql = sql[:-1:1]
+        sql += " FROM public." + tableName
+        if where:
+            if len(where.items()) != 0:
+                sql += " WHERE "
+                for key, value in where.items():
+                    sql += key + " = " + getValueFromInfo(key, value, tableName) + " AND "
+                sql = sql[0:-4:1]
+        if limit:
+            sql = sql + ' LIMIT ' + str(limit[0]) + ' OFFSET ' + str(limit[1])
+        self.cursor.execute(sql)
+        rows = self.cursor.fetchall()
+        data = []
+        for row in rows:
+            info = {}
+            for idx, value in enumerate(fields):
+                info[value] = row[idx]
+            data.append(info)
+        return data
+
+    def exec(self, sql):
+        self.open()
+        self.cursor.execute(sql)
+        rows = self.cursor.fetchone()
+        return rows
+
+# 根据字段类型获取
+def getValueFromInfo(key, value, tableName):
+    fieldsInfo = tableInfos[tableName]
+    typeInfo = fieldsInfo[key]
+    if(typeInfo == "int" or typeInfo == "float"):
+        return str(value)
+    if(typeInfo == "string" or typeInfo == "date"):
+        value = str(value).replace("'", "''")
+        return "'" + str(value) + "'"
+
+# Module-level helper instance shared by importers of this module.
+util = psqlUtil()
--- a/test.py
+++ b/test.py
--- a/year1.html
+++ b/year1.html
--- a/yearly_report_analysis.py
+++ b/yearly_report_analysis.py
+from bs4 import BeautifulSoup
+import re
+
+
+class YearlyReportAnalysis():
+    html = ''
+    titles = []  # 所需要的title
+    no = [
+        'Summary on OIE-listed diseases/infections present in',  # 1
+        'OIE-listed diseases absent in',  # 2
+        'Detailed quantitative information for OIE-listed diseases/infections present in',  # 3
+        'Unreported OIE-listed diseases during the reporting period',  # 4
+        'Summary on non OIE-Listed diseases/infections present in',  # 5
+        'Non OIE-Listed diseases absent in',  # 6
+        'Detailed quantitative information for non OIE-Listed diseases/infections present in',  # 7
+        'Unreported non OIE-Listed diseases',  # 8
+        'Zoonotic diseases in humans',  # 9
+        'Animal population',  # 10
+        'Veterinarians and veterinary para-professionals',  # 11,
+        'National reference laboratories',  # 12
+        'Diagnostic Tests',  # 13
+        'Vaccine Manufacturers',  # 14
+        'Vaccines',  # 15
+        'Vaccine production'  # 16
+          ]
+    bigTitle = [3, 7]
+
+    def __init__(self):
+        pass
+
+    def analysis(self, html):
+        self.html = html
+        self.init_analysis()
+        return self.analysis_1()
+
+    def init_analysis(self):
+        html = BeautifulSoup(self.html, features='html.parser')
+        self.find_class(html)
+
+    # 解析入口
+    def analysis_1(self):
+        result = []
+        for analysis_html in self.titles:
+            if analysis_html.name == 'td':
+                res = self.analysis_title_is_td(analysis_html)
+            else:
+                res = self.analysis_title_is_div(analysis_html)
+            result.append({str(analysis_html.text.strip()): res})
+        return result
+
+    def analysis_which_title(self, title):
+        j = 1
+        for i in self.no:
+            if title.text.find(i) != -1:
+                return j
+            j = j + 1
+        raise TypeError('Number will be None! Please check')
+
+    # 当表头是table时的处理函数
+    def analysis_title_is_td(self, html):
+        number = self.analysis_which_title(html)
+        tr = html.parent
+        table = tr.parent
+        th = table.find('th')
+        tr = th.parent
+        table = self.analysis_table(tr)
+        table['type'] = 'table'
+        table['no'] = number
+        return table
+
+    # 当表头是div的处理函数
+    def analysis_title_is_div(self, html):
+        tables = {}
+        number = self.analysis_which_title(html)
+        if number in self.bigTitle:
+            special = 3  # 大title的时候是3
+            tables['type'] = 'big'
+        else:
+            special = 2  # 小title的时候是2
+            tables['type'] = 'usual'
+        tables['no'] = number
+        tables['tables'] = []
+        tables_html = self.analysis_title_is_div_tables(html)
+        big_title = ''
+        for i in tables_html:
+            if callable(i.strip):
+                pass
+            else:
+                type = i.find(class_='TableFoyers2_thtitle').text.strip()
+                th = i.find('th')
+                if not th:
+                    tr = i.find('tr').next_sibling.next_sibling
+                else:
+                    tr = th.parent
+                table = self.analysis_table(tr)
+                if special == 2:
+                    title = i.find(class_='TableFoyers2_thtitle')
+                    title = title.contents[0].strip()
+                    tables['tables'].append({title: table})
+                elif special == 3:
+                    title = i.previous_sibling.previous_sibling
+                    title = title.contents[0].strip()
+                    title = title.replace(' (Domestic and Wild)', '')
+                    title = title.replace(' (Domestic)', '')
+                    title = title.replace(' (Wild)', '')
+                    if title != '':
+                        big_title = title
+                    else:
+                        title = big_title
+                    tables['tables'].append({title: {'type': type, 'table': table}})
+                else:
+                    tables['tables'].append(table)
+        return tables
+
+    # 表头是div的情况下，获取下面的所有table
+    def analysis_title_is_div_tables(self, html):
+        table = []
+        for i in html.next_siblings:
+            if callable(i.strip):
+                pass
+            elif i in self.titles:
+                break
+            else:
+                if i.find(class_='TableFoyers2_thtitle') in self.titles:
+                    break
+                if i.name == 'table':
+                    table.append(i)
+        return table
+
+    # 拆分表格
+    def analysis_table(self, th_html):
+        th = []
+        td = []
+        ths = th_html.find_all('th')
+        if ths == []:
+            ths = th_html.find_all('td')
+            tmp_th_td = []
+            for j in ths:
+                j = j.text.strip()
+                j = j.replace('\n', '')
+                j = re.sub('\s{2,}', ' ', j)
+                tmp_th_td.append(j)
+            td.append(tmp_th_td)
+        else:
+            for j in ths:
+                th.append(j.text.strip())
+        for i in th_html.next_siblings:
+            if callable(i.strip):
+                pass
+            elif i in self.titles:
+                break
+            else:
+                tds = i.find_all('td')
+                tmp_td = []
+                for k in tds:
+                    tmp = k.text.replace('\n', '').strip()
+                    tmp = re.sub('\s{2,}', ' ', tmp)
+                    tmp_td.append(tmp)
+                if tmp_td:
+                    td.append(tmp_td)
+        return {"th": th, "td": td}
+
+    # 找出所有的带标题class的dom
+    def find_class(self, html):
+        titles = html.find_all(self.true_title)
+        self.titles = titles
+
+    # 真正的title
+    def true_title(self, tag):
+        if tag.attrs.get('class') and 'TableFoyers2_thtitle'in tag.attrs.get('class'):
+            for i in self.no:
+                if tag.text.find(i) != -1:
+                    return True
+            return False
+        return False
+
+
+# Module-level parser instance shared by importers of this module.
+year = YearlyReportAnalysis()
--- a/各标题状况.xlsx
+++ b/各标题状况.xlsx
--- a/标题对应.md
+++ b/标题对应.md
+|标题|标号|
+|---|---|
+|Summary on OIE-listed diseases/infections present in| 1|
+|OIE-listed diseases absent in| 2|
+|Detailed quantitative information for OIE-listed diseases/infections present in| 3|
+|Unreported OIE-listed diseases during the reporting period| 4|
+|Summary on non OIE-Listed diseases/infections present in| 5|
+|Non OIE-Listed diseases absent in| 6|
+|Detailed quantitative information for non OIE-Listed diseases/infections present in| 7|
+|Unreported non OIE-Listed diseases| 8|
+|Zoonotic diseases in humans| 9|
+|Animal population| 10|
+|Veterinarians and veterinary para-professionals| 11|
+|National reference laboratories| 12|
+|Diagnostic Tests| 13|
+|Vaccine Manufacturers| 14|
+|Vaccines| 15|
+|Vaccine production|16|