re模块(正则表达式)
1、什么是正则
正则就是用一系列具有特殊含义的字符组成一套规则,该规则用来描述具有某一特征的字符串,
正则用来在一个大的字符串中匹配出符合规则的子字符串
2、为什么要用正则
1、用户注册
2、爬虫程序
3、如何用正则
import re
# Character classes and anchors.
# Regex escapes are written as raw strings (r'...') so that \w, \d, etc. reach
# the regex engine unchanged instead of being parsed as (invalid) string-literal
# escape sequences. The searched text keeps ordinary literals because its "\n"
# and "\t" are meant to be a real newline and a real tab.
print(re.findall(r'\w', 'hello 123_ */-='))  # \w: word characters (here letters, digits, '_')
# ['h', 'e', 'l', 'l', 'o', '1', '2', '3', '_']
print(re.findall(r'\W', 'hello 123_ */-='))  # \W: everything \w does not match
# [' ', ' ', '*', '/', '-', '=']
print(re.findall(r'\s', 'hell\no 12\t3_ */-='))  # \s: whitespace
# ['\n', ' ', '\t', ' ']
print(re.findall(r'\S', 'hell\no 12\t3_ */-='))  # \S: non-whitespace
# ['h', 'e', 'l', 'l', 'o', '1', '2', '3', '_', '*', '/', '-', '=']
print(re.findall(r'\d', 'hell\no 12\t3_ */-='))  # \d: digits
# ['1', '2', '3']
print(re.findall(r'\D', 'hell\no 12\t3_ */-='))  # \D: non-digits
# ['h', 'e', 'l', 'l', '\n', 'o', ' ', '\t', '_', ' ', '*', '/', '-', '=']
print(re.findall('\n', 'hell\no 12\t3_ */-='))  # the pattern is a literal newline character
# ['\n']
print(re.findall('\t', 'hell\no 12\t3_ */-='))  # the pattern is a literal tab character
# ['\t']
print(re.findall('l', 'hell\no 12\t3_ */-='))  # an ordinary character matches itself
# ['l', 'l']
print(re.findall('egon', 'my name is egon,egon is beautiful'))
# ['egon', 'egon']
print(re.findall('^egon', 'egon my name is egon,egon is beautiful'))  # ^: only at the start of the string
# ['egon']
print(re.findall('egon$', 'egon my name is egon,egon is beautifulegon1'))  # $: only at the end; the text ends in "egon1"
# []
重复匹配
.:匹配换行符以外的任意一个字符
# By default "." matches any single character except a newline.
_dot_hits = re.findall('a.c', 'abc a1c aac asd aaaaac a*c a+c abasd')
print(_dot_hits)
# ['abc', 'a1c', 'aac', 'aac', 'a*c', 'a+c']

# With re.DOTALL the dot matches a newline too, so 'a\nc' is found as well.
_dotall_hits = re.findall(
    'a.c',
    'abc a1c aac a\nc asd aaaaac a*c a+c abasd',
    flags=re.DOTALL,
)
print(_dotall_hits)
# ['abc', 'a1c', 'aac', 'a\nc', 'aac', 'a*c', 'a+c']
[]:匹配一个字符,该字符属于中括号内指定的字符
# Two dots: exactly two arbitrary (non-newline) characters between "a" and "c".
print(re.findall('a..c', 'abc a1 c aac asd aaaaac a *c a+c abasd ='))
# ['a1 c', 'aaac', 'a *c']

# The remaining patterns are all run against the same text so the effect of
# each character set can be compared directly.
_text = 'abc a1 c aac aAc aBc asd aaaaac a-c a/c a *c a+c abasd = a1c a2c'
for _pat in (
    'a.c',           # any single character
    'a[a-z]c',       # one lowercase letter
    'a[A-Z]c',       # one uppercase letter
    'a[-+*/]c',      # one of - + * /  (the leading "-" is a literal, not a range)
    'a[a-z][a-z]c',  # two lowercase letters
    'a[^a-z]c',      # one character that is NOT a lowercase letter
):
    print(re.findall(_pat, _text))
# Printed results, in order:
# ['abc', 'aac', 'aAc', 'aBc', 'aac', 'a-c', 'a/c', 'a+c', 'a1c', 'a2c']
# ['abc', 'aac', 'aac']
# ['aAc', 'aBc']
# ['a-c', 'a/c', 'a+c']
# ['aaac']
# ['aAc', 'aBc', 'a-c', 'a/c', 'a+c', 'a1c', 'a2c']
*: 必须与其他字符连用,代表左侧的字符出现0次或任意多次(等价于{0,})
# "b*" and "b{0,}" are two spellings of the same quantifier (zero or more b's),
# so both patterns print the same list.
_text = 'a ab abbb abbbb a1bbbb a-123'
for _pat in ('ab*', 'ab{0,}'):
    print(re.findall(_pat, _text))
# ['a', 'ab', 'abbb', 'abbbb', 'a', 'a']
# ['a', 'ab', 'abbb', 'abbbb', 'a', 'a']
?: 必须与其他字符连用,代表左侧的字符出现0次或者1次
# "b?" and "b{0,1}" both mean an optional single "b"; longer runs of b's are
# matched only up to the first one.
_text = 'a ab abbb abbbb a1bbbb a-123'
for _pat in ('ab?', 'ab{0,1}'):
    print(re.findall(_pat, _text))
# ['a', 'ab', 'ab', 'ab', 'a', 'a']
# ['a', 'ab', 'ab', 'ab', 'a', 'a']
+: 必须与其他字符连用,代表左侧的字符出现1次或任意多次(等价于{1,})
# "b+" and "b{1,}" both require at least one "b", so a bare "a" no longer matches.
_text = 'a ab abbb abbbb a1bbbb a-123'
for _pat in ('ab+', 'ab{1,}'):
    print(re.findall(_pat, _text))
# ['ab', 'abbb', 'abbbb']
# ['ab', 'abbb', 'abbbb']
{n,m}: 必须与其他字符连用,代表左侧的字符出现n次到m次(省略m写成{n,}表示至少n次)
# {1,3} allows one to three b's, so the four b's in 'abbbb' are matched only
# as far as 'abbb'.
_bounded = re.compile('ab{1,3}')
print(_bounded.findall('a ab abbb abbbb a1bbbb a-123'))
# ['ab', 'abbb', 'abbb']
.*:贪婪匹配
# Greedy ".*" consumes as much as it can, so the single match runs from the
# first "a" to the last "c" in the text.
_greedy_text = 'ab123adfc1134124123adasfc123123'
print(re.findall('a.*c', _greedy_text))
# ['ab123adfc1134124123adasfc']
.*?:非贪婪匹配
# Non-greedy ".*?" stops at the nearest "c", so the same text now yields two
# short matches instead of one long one.
_lazy_text = 'ab123adfc1134124123adasfc123123'
print(re.findall('a.*?c', _lazy_text))
# ['ab123adfc', 'adasfc']
():分组
# When the pattern contains one capturing group, findall returns only the
# text captured by that group, not the whole match.
_attr_text = 'expression="1+2+3/4*5" egon="beautiful"'
print(re.findall('expression="(.*?)"', _attr_text))
# ['1+2+3/4*5']

# Same idea applied to markup: pull out every href value.
_html = '<p>段落</p><a href="https://www.sb.com">点我啊</a><h1>标题</h1><a href="https://www.sb.com">点我啊</a>'
print(re.findall('href="(.*?)"', _html))
# ['https://www.sb.com', 'https://www.sb.com']
|:或者,匹配左侧或右侧的表达式;(?:...)表示只分组、不捕获,所以findall返回的是整个匹配到的内容
# "|" matches either alternative. In the second pattern (?:...) groups the
# alternatives without capturing, so findall returns the whole matched word.
for _pat, _text in (
    ('a|b', 'ab123abasdfaf'),
    ('compan(?:ies|y)', 'Too many companies have gone bankrupt, and the next one is my company'),
):
    print(re.findall(_pat, _text))
# ['a', 'b', 'a', 'b', 'a', 'a']
# ['companies', 'company']
companies company
# Matching a literal backslash. The regex engine needs "\\" to mean one
# backslash: write it either as a raw string (r'a\\c') or with every backslash
# doubled again in a normal string ('a\\\\c').
# The searched text is spelled 'a\\c' (a, backslash, c); the previous 'a\c'
# produced the same characters only by relying on the invalid escape "\c".
print(re.findall(r'a\\c', 'a\\c a1c aAc aac'))
# ['a\\c']
print(re.findall('a\\\\c', 'a\\c a1c aAc aac'))
# ['a\\c']
# With a capturing group, findall returns the captured text ("x"), not "alex".
print(re.findall('ale(x)', 'alex is SB,alex is bigSB'))
# ['x', 'x']
# re.search scans the whole string and returns a match object for the first
# hit, or None when nothing matches.
print(re.search('alex', 'alex is SB,alex is bigSB'))
# <re.Match object; span=(0, 4), match='alex'>
# (Python 3.6 and earlier print <_sre.SRE_Match object; span=(0, 4), match='alex'>)

# .group() returns the whole matched text even when the pattern has a group.
# It is only safe here because a match exists; on None it would raise AttributeError.
print(re.search('ale(x)', 'alex is SB,alex is bigSB').group())
# alex
print(re.search('abcdefg', 'alex is SB,alex is bigSB'))
# None

# "^" anchors the search at the start of the string, and the text starts with "123".
print(re.search('^alex', '123alex is SB,alex is bigSB'))
# None
# re.match only tries to match at the start of the string, so it gives the
# same result as the "^"-anchored search above.
print(re.match('alex', '123alex is SB,alex is bigSB'))
# None
# str.split cuts on one fixed separator.
l = 'egon:18:male'.split(sep=':')
print(l)
# ['egon', '18', 'male']

# re.split cuts on a pattern; the character set lets a space, ':', '/' or '-'
# each act as a separator.
l1 = re.split(pattern='[ :/-]', string='a-b/c egon:18:male xxx')
print(l1)
# ['a', 'b', 'c', 'egon', '18', 'male', 'xxx']
# re.sub replaces every non-overlapping match of the pattern (one or more
# lowercase letters followed by "xx") with the replacement text.
print(re.sub('[a-z]+xx', 'yxp', 'lxx is good,sb is lllxx wxx is good cxx is good'))
# yxp is good,sb is yxp yxp is good yxp is good
# Compile the pattern once; the compiled object offers the same operations
# (findall, search, ...) as the module-level functions.
pattern = re.compile('alex')
print(pattern.findall('alex is SB,alex is bigSB'))
# ['alex', 'alex']
print(pattern.search('alex is SB,alex is bigSB'))
# <re.Match object; span=(0, 4), match='alex'>
# (Python 3.6 and earlier print <_sre.SRE_Match object; span=(0, 4), match='alex'>)