handle word and excel simply

EagleYing · Apr 15, 2019 · 4b69504 · 4b69504
1 parent 246767b
commit 4b69504
Showing 1 changed file with 205 additions and 0 deletions.
diff --git a/docop.py b/docop.py
@@ -0,0 +1,205 @@
+# encoding=utf-8
+import docx
+import collections
+import jieba
+import re
+import string
+import xlwt
+import os
+
+def CountWordsNum(file):
+    """Return the most frequent "public / consumer" term(s) in a document.
+
+    Every paragraph of ``file`` is scanned for eight fixed terms. A term that
+    is contained in a longer term (for example 公众 inside 相关公众) is only
+    credited with the occurrences that are not part of the longer term.
+
+    Args:
+        file: Object exposing ``paragraphs``, an iterable of items that each
+            have a ``text`` string attribute.
+
+    Returns:
+        The term(s) with the highest count, each followed by one space, in
+        the fixed order of the term list; tied terms are all listed. An
+        empty string is returned when none of the terms occurs at all.
+    """
+    terms = [
+        '相关公众', '公众', '相关消费者', '普通消费者',
+        '消费者', '普通购买者', '购买者', '他人',
+    ]
+    counts = [0] * len(terms)
+    for paragraph in file.paragraphs:
+        text = paragraph.text
+        for index, term in enumerate(terms):
+            # None of the terms can overlap itself, so str.count gives the
+            # exact number of occurrences.
+            counts[index] += text.count(term)
+
+    # Remove the occurrences already credited to the longer terms.
+    counts[1] -= counts[0]
+    counts[4] -= counts[3] + counts[2]
+    counts[6] -= counts[5]
+
+    highest = max(counts)
+    if highest == 0:
+        # Nothing matched: listing all eight terms as "most frequent" would
+        # be misleading, so report nothing.
+        return ""
+    return "".join(
+        term + " " for term, count in zip(terms, counts) if count == highest
+    )
+
+def JudgeCorporation(file):
+    """Label the plaintiff and the defendant as a company or a natural person.
+
+    The first non-empty paragraph is used as the case title and is split on
+    诉: the part before it is treated as the plaintiff, the part after it as
+    the defendant. A part containing 公司 is labelled 法人, otherwise 自然人.
+
+    Args:
+        file: Object exposing ``paragraphs``, an iterable of items that each
+            have a ``text`` string attribute.
+
+    Returns:
+        A two-item list ``[plaintiff_label, defendant_label]``. When the
+        document has no non-empty paragraph, or the title contains no 诉,
+        both labels end in 无法判断 instead of an IndexError being raised.
+        Each label is also printed.
+    """
+    title = next((p.text for p in file.paragraphs if len(p.text) != 0), "")
+    parties = title.split("诉")
+
+    labels = []
+    for role, index in (("原告", 0), ("被告", 1)):
+        if len(parties) < 2:
+            # Without 诉 the title cannot be divided into the two parties.
+            label = role + "无法判断"
+        elif "公司" in parties[index]:
+            label = role + "法人"
+        else:
+            label = role + "自然人"
+        print(label)
+        labels.append(label)
+    return labels
+
+
+def _confusion_similarity_order(texts):
+    """Return the verdict from the first sentence containing both 混淆 and 近似."""
+    for text in texts:
+        for sentence in text.split("。"):
+            if "混淆" in sentence and "近似" in sentence:
+                # Look only at the text before the first 混淆 to see which
+                # of the two words comes first in the sentence.
+                before_confusion = sentence.split("混淆")[0]
+                if "近似" in before_confusion:
+                    return "相似导致混淆"
+                return "混淆导致相似"
+    return "无法判断"
+
+
+def CharacterofCase(file):
+    """Classify a case by how its reasoning section uses 混淆 and 近似.
+
+    The reasoning section is the run of paragraphs after the paragraph whose
+    first whitespace-separated token is 【裁判理由】 and before the paragraph
+    whose first token is 本案法律依据. If the closing marker is missing, the
+    section runs to the end of the document.
+
+    Args:
+        file: Object exposing ``paragraphs``, a sequence of items that each
+            have a ``text`` string attribute.
+
+    Returns:
+        One of the following strings, which is also printed:
+        混淆标准 when only 混淆 occurs in the section, 近似标准 when only 近似
+        occurs, 无关案件 when neither occurs. When both occur, the first
+        sentence (split on 。) containing both decides: 相似导致混淆 if 近似
+        appears before the first 混淆, otherwise 混淆导致相似; 无法判断 if no
+        single sentence contains both.
+    """
+    paragraphs = file.paragraphs
+    start = 0
+    # Default to the end of the document: with a default of 0 a missing
+    # closing marker would leave the section empty and label every such
+    # case 无关案件.
+    end = len(paragraphs)
+    for index, paragraph in enumerate(paragraphs):
+        tokens = paragraph.text.split()
+        if not tokens:
+            continue
+        if tokens[0] == "【裁判理由】":
+            start = index
+        if tokens[0] == "本案法律依据":
+            end = index
+            break
+
+    texts = [paragraph.text for paragraph in paragraphs[start + 1:end]]
+    has_confusion = any("混淆" in text for text in texts)
+    has_similarity = any("近似" in text for text in texts)
+
+    if has_confusion and not has_similarity:
+        verdict = "混淆标准"
+    elif has_similarity and not has_confusion:
+        verdict = "近似标准"
+    elif not has_confusion and not has_similarity:
+        verdict = "无关案件"
+    else:
+        verdict = _confusion_similarity_order(texts)
+    print(verdict)
+    return verdict
+
+def set_style(name, height, bold=False):
+    """Build an xlwt cell style with the given font settings.
+
+    Args:
+        name: Font name.
+        height: Value assigned to the font's ``height`` unchanged.
+        bold: Whether the font is bold.
+
+    Returns:
+        An ``xlwt.XFStyle`` whose font uses colour index 4.
+    """
+    font = xlwt.Font()
+    font.name = name
+    font.bold = bold
+    # xlwt spells the attribute "colour_index"; assigning "color_index" only
+    # creates an unused attribute, so the colour would never be applied.
+    font.colour_index = 4
+    font.height = height
+
+    style = xlwt.XFStyle()
+    style.font = font
+    return style
+
+
+def write_excel(path, rows):
+    """Write the analysis results to an Excel workbook.
+
+    A single sheet named ``sheet1`` is created. Row 0 holds the four fixed
+    column headings, written with the bold heading style; each item of
+    ``rows`` is then written to the following rows, one value per cell
+    starting at column 0.
+
+    Args:
+        path: Destination passed to ``Workbook.save``.
+        rows: Iterable of rows, each an iterable of cell values.
+    """
+    book = xlwt.Workbook(encoding='utf-8')
+    sheet = book.add_sheet('sheet1')
+
+    headings = ["原告性质", "被告性质", "最高频词汇", "案件性质"]
+    for col_index, heading in enumerate(headings):
+        sheet.write(0, col_index, heading, set_style('黑体', 220, True))
+
+    # Data rows start directly below the heading row.
+    for row_index, values in enumerate(rows, start=1):
+        for col_index, value in enumerate(values):
+            sheet.write(row_index, col_index, value)
+
+    book.save(path)
+
+def ScanFile(filepath):
+    """Return the names of all entries in the directory ``filepath``."""
+    return os.listdir(filepath)
+
+if __name__ == '__main__':
+    # Ask for the folder holding the Word cases, the Excel file name and the
+    # folder the Excel file should be written to.
+    print("请输入您word案例所在的路径,例如:")
+    print("Mac用户: /Users/eagleying/Downloads/Cases/")
+    print("Windows用户: D:\\Case\\")
+    filepath = input()
+    print("请输入Excel文件名,如result.xls")
+    filename = input()
+    print("请输入生成Excel文件的路径:")
+    excelpath = input()
+
+    allfiles = ScanFile(filepath)
+    rows = []
+    for f in allfiles:
+        # Skip entries that are not .docx documents, and names starting with
+        # "~$", so that stray files in the folder do not abort the run.
+        if not f.lower().endswith(".docx") or f.startswith("~$"):
+            continue
+        # os.path.join works whether or not the user typed a trailing
+        # path separator.
+        file = docx.Document(os.path.join(filepath, f))
+        row = list(JudgeCorporation(file))
+        row.append(CountWordsNum(file))
+        row.append(CharacterofCase(file))
+        rows.append(row)
+
+    path = os.path.join(excelpath, filename)
+    write_excel(path, rows)
+    print("生成成功^o^")