這個單子因爲時間有限以及在家的緣故,只做了一半就沒時間了。另外還遇到一個問題:用正則表達式修改之後,輸出的格式有點看不懂,需要重新 append 到列表中再循環遍歷輸出,這樣才方便繼續修改。
import pandas as pd
import numpy as np
import re
import matplotlib. pyplot as plt
% matplotlib inline
# Read the exported workbook into a DataFrame and preview its first rows;
# the cells below work on its 'txt' column.
raw = pd.read_excel(io='彙總.xlsx')
raw.head()
txt
0
PT P
1
PN JP2019213676-A
2
TI Odor modulation agent useful for modulating...
3
AU BAO X X
4
TSUJIMOTO H
# Number the records: every 'PT P' line opens a new record, so a running
# count of those markers labels each row with the record it belongs to
# (rows before the first marker get 0).  This replaces the row-by-row
# Python loop with one vectorised pass.  The cast to float keeps the
# 1.0, 2.0, ... labels that the loop's incremental .loc assignment produced.
raw['chap'] = (raw['txt'] == "PT P").cumsum().astype(float)
# Total number of records, kept under the name the loop used to leave behind.
chapnum = int((raw['txt'] == "PT P").sum())
rawgrp = raw.groupby('chap')
# Summing the string column concatenates every line of a record into one
# string.  The string 'sum' is passed instead of the builtin `sum`, which
# newer pandas versions warn about when given as a callable.
chapter = rawgrp.agg('sum')
chapter.head()
txt
chap
1.0
PT PPN JP2019213676-ATI Odor modulation agent ...
2.0
PT PPN WO2019237159-A1TI Treating solid waste ...
3.0
PT PPN US2019382319-A1; EP3581551-A1TI Bacteri...
4.0
PT PPN WO2019237134-A1TI Aerobic waste compost...
5.0
PT PPN CN110550724-ATI Biological filler usefu...
def pn(x):
    """Return every patent-number field found in the record string ``x``.

    A field is the text between a ``'PN '`` tag and the next ``'TI'``;
    ``re.S`` lets that span run across line breaks.  The result is the list
    produced by ``re.findall`` and is empty when no tagged span exists.
    """
    return re.findall('PN (.*?)TI', x, flags=re.S)
# pn() only needs the merged record text, so run it over the 'txt' column
# directly instead of building a row object per record for a lambda.
chapter['pn'] = chapter['txt'].apply(pn)
def UT(x):
    """Return every ``UT`` field found in the record string ``x``.

    A field is the text between a ``'UT '`` tag and the next ``'ER'``;
    ``re.S`` lets that span run across line breaks.  The result is the list
    produced by ``re.findall`` and is empty when no tagged span exists.
    """
    return re.findall('UT (.*?)ER', x, flags=re.S)
# UT() only needs the merged record text, so run it over the 'txt' column
# directly instead of building a row object per record for a lambda.
chapter['UT'] = chapter['txt'].apply(UT)
def TI(x):
    """Return every title field found in the record string ``x``.

    A field is the text between a ``'TI '`` tag and the next ``'AU'``;
    ``re.S`` lets that span run across line breaks.  The result is the list
    produced by ``re.findall`` and is empty when no tagged span exists.
    """
    return re.findall('TI (.*?)AU', x, flags=re.S)
# TI() only needs the merged record text, so run it over the 'txt' column
# directly instead of building a row object per record for a lambda.
chapter['TI'] = chapter['txt'].apply(TI)
chapter.head()
txt
pn
UT
TI
chap
1.0
PT PPN JP2019213676-ATI Odor modulation agent ...
[JP2019213676-A]
[DIIDW:2019A6103W]
[Odor modulation agent useful for modulating s...
2.0
PT PPN WO2019237159-A1TI Treating solid waste ...
[WO2019237159-A1]
[DIIDW:2019A6029L]
[Treating solid waste extracted from liquid wa...
3.0
PT PPN US2019382319-A1; EP3581551-A1TI Bacteri...
[US2019382319-A1; EP3581551-A1]
[DIIDW:2019A6044K]
[Bacterially decomposing organic waste materia...
4.0
PT PPN WO2019237134-A1TI Aerobic waste compost...
[WO2019237134-A1]
[DIIDW:2019A4913N]
[Aerobic waste composting chamber used for sol...
5.0
PT PPN CN110550724-ATI Biological filler usefu...
[CN110550724-A]
[DIIDW:2019A5191S]
[Biological filler useful for treating livesto...
# The 'pn' column holds the lists returned by pn(), and the .str text
# methods yield NaN for list elements, which is why the original cell turned
# the whole column into NaN.  Unwrap the first match of each list first
# (empty list -> NaN; values that are already plain text pass through).
# If a record ever yields several matches, only the first one is kept.
# regex=False states explicitly that '[' is a literal character, independent
# of the default of the installed pandas version.
chapter['pn'] = (
    chapter['pn']
    .map(lambda v: (v[0] if v else np.nan) if isinstance(v, list) else v)
    .astype(object)  # keep the .str accessor usable even if every value is NaN
    .str.replace('[', '', regex=False)
)
chapter.head()
txt
pn
UT
TI
chap
1.0
PT PPN JP2019213676-ATI Odor modulation agent ...
NaN
[DIIDW:2019A6103W]
[Odor modulation agent useful for modulating s...
2.0
PT PPN WO2019237159-A1TI Treating solid waste ...
NaN
[DIIDW:2019A6029L]
[Treating solid waste extracted from liquid wa...
3.0
PT PPN US2019382319-A1; EP3581551-A1TI Bacteri...
NaN
[DIIDW:2019A6044K]
[Bacterially decomposing organic waste materia...
4.0
PT PPN WO2019237134-A1TI Aerobic waste compost...
NaN
[DIIDW:2019A4913N]
[Aerobic waste composting chamber used for sol...
5.0
PT PPN CN110550724-ATI Biological filler usefu...
NaN
[DIIDW:2019A5191S]
[Biological filler useful for treating livesto...
def UT(x):
    """Return the first run of four consecutive digits in ``str(x)``.

    ``x`` is stringified before searching, so a list of matches, a plain
    string, or an earlier result of this function are all accepted.

    Returns:
        The four-digit string (for example ``'2019'``), or ``''`` when
        ``str(x)`` contains no such run.

    The previous version returned a piece of a list repr (``"['2019"`` or
    ``"['2019']"``), which only became clean after a second application.
    Returning the bare digits makes the function idempotent, so applying it
    more than once is harmless.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape.
    found = re.search(r'\d{4}', str(x))
    return found.group(0) if found else ''
# Run UT() over the column itself instead of a row-wise lambda.  The second
# pass is kept from the original notebook: it feeds the string output of the
# first pass back through UT() (a no-op whenever UT() is idempotent).
chapter['UT'] = chapter['UT'].apply(UT).apply(UT)
# Drop the merged record text column.
chapter = chapter.drop(columns=['txt'])
chapter.head()
pn
UT
TI
chap
1.0
NaN
['2019']
[Odor modulation agent useful for modulating s...
2.0
NaN
['2019']
[Treating solid waste extracted from liquid wa...
3.0
NaN
['2019']
[Bacterially decomposing organic waste materia...
4.0
NaN
['2019']
[Aerobic waste composting chamber used for sol...
5.0
NaN
['2019']
[Biological filler useful for treating livesto...
# Keep the first run of characters that are not square brackets or single
# quotes.  The pattern is written as a raw string because '\[' and '\]' are
# invalid escapes in a normal string literal; the regex itself is unchanged.
# expand=False makes extract() return a Series for the single capture group
# rather than a one-column DataFrame.
chapter['pn'] = chapter['pn'].str.extract(r"([^\[\]']+)", expand=False)
chapter['UT'] = chapter['UT'].str.extract(r"([^\[\]']+)", expand=False)
chapter.head()
pn
UT
TI
chap
1.0
NaN
2019
[Odor modulation agent useful for modulating s...
2.0
NaN
2019
[Treating solid waste extracted from liquid wa...
3.0
NaN
2019
[Bacterially decomposing organic waste materia...
4.0
NaN
2019
[Aerobic waste composting chamber used for sol...
5.0
NaN
2019
[Biological filler useful for treating livesto...
爬出從1963年到2019年,每年專利的數量,列excel表
# Discard every row that has a missing value in any column.
chapter.dropna(inplace=True)
# Blank out the '5575' value (presumably a mis-parsed year — confirm).  The
# rows themselves stay in the frame; only their 'UT' becomes NaN, and
# groupby leaves NaN keys out of the result by default.
chapter['UT'] = chapter['UT'].mask(chapter['UT'] == '5575')
# Count the non-missing 'pn' entries for each 'UT' value.
request1 = chapter.groupby('UT')['pn'].count()
request1.head()
UT
1981 1
1984 1
1990 3
1991 1
1992 2
Name: pn, dtype: int64
from pylab import mpl
# Font setup: use FangSong for sans-serif text (presumably so the Chinese
# title renders — confirm the font is installed) and draw minus signs with
# the ASCII hyphen instead of the Unicode minus glyph.
mpl.rcParams.update({
    'font.sans-serif': ['FangSong'],
    'axes.unicode_minus': False,
})
# One figure with a single axes, then a bar chart of the per-'UT' counts.
asd, sdf = plt.subplots(nrows=1, ncols=1, dpi=100)
request1.plot.bar(title='數量分佈', ax=sdf)
plt.show()
各個國家和前十的機構,都申請了多少專利
# Label each row with the first two characters of its patent number
# (values such as 'CN' or 'KR' in the printed counts), then count the rows
# per label.
chapter['organization'] = chapter['pn'].str.slice(stop=2)
chapter['organization'].value_counts()
CN 275
KR 22
WO 15
ID 11
JP 11
EP 7
IN 6
DE 5
FR 4
US 4
RU 4
BR 3
NL 2
GB 1
PL 1
PH 1
TW 1
Name: organization, dtype: int64