String find

In [1]:
# str.find returns the index where the substring first occurs.
s = 'hello world :)'
s.find('hel')  # 'hel' opens the string, so the result is 0
Out[1]:
0
In [33]:
# str.find signals "not found" with -1 instead of raising an exception.
s = 'hello world :)'
s.find('hek')
Out[33]:
-1
In [3]:
# Uses `s` from an earlier cell; the first 'w' sits at index 6.
s.find('w')
Out[3]:
6

Regex search

In [6]:
import re 
s = 'hello world :)'
# re.search scans the string and returns a Match object for the first hit.
re.search('w', s)
Out[6]:
<re.Match object; span=(6, 7), match='w'>

re.search returns None if there is no match

In [7]:
# No match here: the result is None, so the notebook shows no Out[] value.
re.search('m', s)
In [8]:
# print() makes the None returned for a failed search visible.
print(re.search('m', s))
None
In [37]:
# Announce the character only when it occurs in `s`
# (`s` and `re` come from earlier cells).
char = 'm'
hit = re.search(char, s)
if hit is not None:
    print(char, 'found')
In [38]:
# Announce the character only when it occurs in `s`
# (`s` and `re` come from earlier cells).
char = 'w'
hit = re.search(char, s)
if hit is not None:
    print(char, 'found')
w found
In [39]:
# '^' anchors the pattern to the start of the string, so a 'w' further in
# does not count and the search yields None (nothing is displayed).
re.search('^w', s)

Regex findall method returns a list of results

In [41]:
# Sample sentence mixing words and numbers for the findall examples.
x = 'My 2 favorite numbers are 19 and 42, 2019'
In [42]:
# '[0-9]+' matches each maximal run of digits, i.e. whole numbers.
print(re.findall('[0-9]+', x))
['2', '19', '42', '2019']
In [10]:
# Character classes are case-sensitive: `x` holds no uppercase A, B or C,
# so findall returns an empty list.
print(re.findall('[A-C]+', x))
[]
In [25]:
# Without a quantifier the class matches one digit at a time.
x = 'My 2 favorite numbers are 19 and 42, 2019'
single_digits = re.findall('[0-9]', x)
print(single_digits)
['2', '1', '9', '4', '2', '2', '0', '1', '9']
In [56]:
# Only digits 0-7 are in the class, so every 8 or 9 ends the current run.
x = 'My 2 favorite numbers are 19 and 42, 2019'
low_digit_runs = re.findall('[0-7]+', x)
print(low_digit_runs)
['2', '1', '42', '201']
In [11]:
x1 = 'My 2 favorite numbers are 19 and 42, 2019'
# '+' would merge adjacent vowels into one match; this sentence has none,
# so every match is a single letter.
re.findall('[aeiou]+', x1)
Out[11]:
['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
In [27]:
x2 = 'My 2 favorite numbers are 19 and 42, 2019 at beach'
# With '+', the adjacent vowels in 'beach' come back as one match: 'ea'.
re.findall('[aeiou]+', x2)
Out[27]:
['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a', 'a', 'ea']
In [62]:
# Collect every lowercase vowel, one character per match.
x = 'My 2 favorite numbers are 19 and 42, 2019'
vowel = re.compile('[aeiou]')
l = vowel.findall(x)
print(l)
['a', 'o', 'i', 'e', 'u', 'e', 'a', 'e', 'a']
In [58]:
# '^' inside the brackets negates the class: match anything but a lowercase vowel.
x = 'My 2 favorite numbers are 19 and 42, 2019'
non_vowels = re.findall('[^aeiou]', x)
print(non_vowels)
['M', 'y', ' ', '2', ' ', 'f', 'v', 'r', 't', ' ', 'n', 'm', 'b', 'r', 's', ' ', 'r', ' ', '1', '9', ' ', 'n', 'd', ' ', '4', '2', ',', ' ', '2', '0', '1', '9']
In [15]:
# Negated class plus '+': each match is a maximal stretch with no lowercase vowel.
x = 'My 2 favorite numbers are 19 and 42, 2019'
non_vowel_runs = re.findall('[^aeiou]+', x)
print(non_vowel_runs)
['My 2 f', 'v', 'r', 't', ' n', 'mb', 'rs ', 'r', ' 19 ', 'nd 42, 2019']
In [16]:
# Matching is case-sensitive: `x` (set in the previous cell) contains no
# uppercase vowels, so the result is an empty list.
print(re.findall('[AEIOU]+', x))
[]
In [22]:
import re

# Count the lines of the mailbox file that begin with 'From:'.
count = 0
# The with-block closes the file when the loop ends, even on an error;
# the bare open() used before never released the handle.
with open('mbox-short.txt') as infile:
    for line in infile:
        # '^' anchors the pattern to the start of the line
        if re.search('^From:', line):
            count += 1
print('count', count)
count 27
In [23]:
import re

# Count the lines that start with 'X' and contain a ':' later on the line.
count = 0
# The with-block closes the file when the loop ends, even on an error;
# the bare open() used before never released the handle.
with open('mbox-short.txt') as infile:
    for line in infile:
        # '.*' allows any characters between the leading 'X' and a ':'
        if re.search('^X.*:', line):
            count += 1
print('count', count)
count 216

Greedy search

In [29]:
import re

# '.+' is greedy: it runs as far as it can, so the match ends at the LAST ':'.
x = 'From: Using the : character'
greedy = re.compile('^F.+:')
y = greedy.findall(x)
print(y)
['From: Using the :']

non-Greedy search

In [30]:
import re

# '.+?' is non-greedy: it stops as soon as possible, i.e. at the FIRST ':'.
x = 'From: Using the : character'
lazy = re.compile('^F.+?:')
y = lazy.findall(x)
print(y)
['From:']
In [28]:
# Whitespace-split the header line; the address is the second field.
line = 'From stephen.marquard@uct.ac.za Sat Jan  5 09:14:16 2008'
fields = line.split()
print(fields[1])
stephen.marquard@uct.ac.za

Extract email address using regex

In [2]:
import re

x = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2018'
# Raw string: '\S' is not a valid Python string escape, so a plain literal
# triggers an invalid-escape warning. r'...' passes the backslash to the
# regex engine untouched.
# \S+@\S+ = non-blank characters on both sides of an '@'.
y = re.findall(r'\S+@\S+', x)
print(y)
['msaad@iugaza.edu.ps']
In [32]:
import re

x1 = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2018'
x2 = 'my email address is m@google.com'
# Compiled once and reused for both inputs. The raw string avoids the
# invalid '\S' string escape. The parentheses make findall return only
# the address, and '^From ' restricts matches to lines starting with 'From '.
from_email = re.compile(r'^From (\S+@\S+)')
y = from_email.findall(x1)
print(y)
# x2 contains an address but does not start with 'From ', so nothing matches.
y = from_email.findall(x2)
print(y)
['msaad@iugaza.edu.ps']
[]

Extract domain name using regex

In [19]:
import re

x = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2008'
# Raw string avoids the invalid '\S' string escape.
# The capture group keeps only the non-blank text after the '@'.
y = re.findall(r'@(\S+)', x)
print(y)
['iugaza.edu.ps']
In [21]:
import re

# Same extraction with a negated class: capture everything after '@'
# up to (not including) the next space.
x = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2008'
domain = re.compile('@([^ ]+)')
y = domain.findall(x)
print(y)
['iugaza.edu.ps']
In [57]:
import re

x = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2008'
x2 = 'my email address is m@google.com'
# Compiled once and reused for both inputs. The raw string avoids the
# invalid '\S' string escape. Only the part after '@' is captured, and
# only on lines that start with 'From '.
from_domain = re.compile(r'^From \S+@(\S+)')
y = from_domain.findall(x)
print(y)
# x2 does not start with 'From ', so the result is empty.
y = from_domain.findall(x2)
print(y)
['iugaza.edu.ps']
[]
In [23]:
import re

x = 'From msaad@iugaza.edu.ps Sat Jan  5 09:14:16 2008'
# Raw string avoids the invalid '\S' string escape.
# The domain is everything after '@' up to the next space.
y = re.findall(r'^From \S+@([^ ]+)', x)
print(y)
# findall returns a list; guard the index so a non-matching line
# does not raise IndexError.
if y:
    print(y[0])
['iugaza.edu.ps']
iugaza.edu.ps

Get the Max and Min spam scores from emails

اكتب برنامج لاستخراج سبام سكور

spam score

من ملف الرسائل الالكترونية وايجاد اعلى وادنى قيمة للسبام

In [9]:
import re

# Collect every X-DSPAM-Confidence value in the mailbox file,
# then report how many there are and the largest and smallest.
spam_scores = list()
# The with-block closes the file when the loop ends, even on an error;
# the bare open() used before never released the handle.
with open('mbox-short.txt') as infile:
    for line in infile:
        line = line.rstrip()  # remove \n
        # The capture group returns only the numeric part of the header
        score = re.findall('^X-DSPAM-Confidence: ([0-9.]+)', line)
        if not score:  # empty list: not a confidence header
            continue
        num = float(score[0])
        spam_scores.append(num)

print(spam_scores)
print(len(spam_scores))
print('Maximum:', max(spam_scores))
print('Minimum:', min(spam_scores))
[0.8475, 0.6178, 0.6961, 0.7565, 0.7626, 0.7556, 0.7002, 0.7615, 0.7601, 0.7605, 0.6959, 0.7606, 0.7559, 0.7605, 0.6932, 0.7558, 0.6526, 0.6948, 0.6528, 0.7002, 0.7554, 0.6956, 0.6959, 0.7556, 0.9846, 0.8509, 0.9907]
27
Maximum: 0.9907
Minimum: 0.6178

find emails and their spam scores

اكتب برنامج لاستخراج السبام سكور والبريد الالكتروني من الترويسة من ملف الرسائل الالكترونية

In [10]:
import re

# Read the whole mailbox into one string. The with-block closes the file
# handle, which open(...).read() on its own left open.
with open('mbox-short.txt') as mbox:
    infile = mbox.read()
# re.MULTILINE makes '^' match at the start of every line of the text.
scores = re.findall('^X-DSPAM-Confidence: ([0-9.]+)', infile, re.MULTILINE)
print(scores[:3])  # show sample score
# Raw string avoids the invalid '\S' string escape.
emails = re.findall(r'^From (\S+@\S+)', infile, re.MULTILINE)
print(emails[:3])  # show sample emails
# zip() stops at the shorter list, so this check shows whether
# every email actually has a score paired with it.
print(len(scores) == len(emails))
for email, score in zip(emails, scores):
    print('email: {}\tspam score: {}'.format(email, score))
['0.8475', '0.6178', '0.6961']
['stephen.marquard@uct.ac.za', 'louis@media.berkeley.edu', 'zqian@umich.edu']
True
email: stephen.marquard@uct.ac.za	spam score: 0.8475
email: louis@media.berkeley.edu	spam score: 0.6178
email: zqian@umich.edu	spam score: 0.6961
email: rjlowe@iupui.edu	spam score: 0.7565
email: zqian@umich.edu	spam score: 0.7626
email: rjlowe@iupui.edu	spam score: 0.7556
email: cwen@iupui.edu	spam score: 0.7002
email: cwen@iupui.edu	spam score: 0.7615
email: gsilver@umich.edu	spam score: 0.7601
email: gsilver@umich.edu	spam score: 0.7605
email: zqian@umich.edu	spam score: 0.6959
email: gsilver@umich.edu	spam score: 0.7606
email: wagnermr@iupui.edu	spam score: 0.7559
email: zqian@umich.edu	spam score: 0.7605
email: antranig@caret.cam.ac.uk	spam score: 0.6932
email: gopal.ramasammycook@gmail.com	spam score: 0.7558
email: david.horwitz@uct.ac.za	spam score: 0.6526
email: david.horwitz@uct.ac.za	spam score: 0.6948
email: david.horwitz@uct.ac.za	spam score: 0.6528
email: david.horwitz@uct.ac.za	spam score: 0.7002
email: stephen.marquard@uct.ac.za	spam score: 0.7554
email: louis@media.berkeley.edu	spam score: 0.6956
email: louis@media.berkeley.edu	spam score: 0.6959
email: ray@media.berkeley.edu	spam score: 0.7556
email: cwen@iupui.edu	spam score: 0.9846
email: cwen@iupui.edu	spam score: 0.8509
email: cwen@iupui.edu	spam score: 0.9907

Escape Character: Extract prices

price = $11.99

We just received $10.00 for cookies.

In [11]:
import re

x = 'We just received $10.00 for cookies.'
# '$' normally means end-of-line in a regex, so it is escaped to match a
# literal dollar sign. The raw string avoids the invalid '\$' string escape.
y = re.findall(r'\$[0-9.]+', x)
print(y)
['$10.00']
In [13]:
import re

x = 'price = $11.99'
# '\$' matches a literal dollar sign. The raw string avoids the invalid
# '\$' string escape.
y = re.findall(r'\$[0-9.]+', x)
print(y)
['$11.99']

find all prices in a text

اكتب برنامج لاستخراج كل الاسعار من نص

In [11]:
import re

text = '''
I bought a laptop for $545.00  :) 
I bought milk for $1.50 only 
The price of the piece is $12.60 incuding VAT
I go to the university everyday but thursday and friday
'''
# findall already scans the whole multi-line string. re.MULTILINE only
# changes how '^' and '$' anchors behave, and this pattern has none, so
# the flag was dropped. The raw string avoids the invalid '\$' string escape.
money = re.findall(r'\$[0-9.]+', text)
print(money)
['$545.00', '$1.50', '$12.60']

Find Money in text :)

Print the lines that contain a price

اكتب برنامج لطباعة الاسطر التي تحتوي على سعر

In [15]:
import re

text = '''
I bought a laptop for $545.00  :) 
I bought milk for $1.50 only 
The price of the piece is $12.60 incuding VAT
I go to the university everyday but thursday
'''
# Literal '$' followed by digits/dots. The raw string avoids the invalid
# '\$' string escape.
price = re.compile(r'\$[0-9.]+')
# Print every line that mentions a price.
for line in text.split('\n'):
    if price.search(line):
        print(line)
I bought a laptop for $545.00  :) 
I bought milk for $1.50 only 
The price of the piece is $12.60 incuding VAT
In [24]:
import re

text = '''
I bought a laptop for $545.00  :) 
I bought milk for $1.50 only 
The price of the piece is $12.60 incuding VAT
I go to the university everyday but thursday
'''
# Price must directly follow the word 'for'. The raw string avoids the
# invalid '\$' string escape.
price_after_for = re.compile(r'for \$[0-9.]+')
for line in text.split('\n'):
    # Search once per line and reuse the result instead of running
    # the same regex twice.
    match = price_after_for.search(line)
    if match:
        print(match)
        print(line)
<re.Match object; span=(18, 29), match='for $545.00'>
I bought a laptop for $545.00  :) 
<re.Match object; span=(14, 23), match='for $1.50'>
I bought milk for $1.50 only 
In [25]:
import re

text = '''
I bought a laptop for $545.00  :) 
I bought milk for $1.50 only 
The price of the piece is $12.60 incuding VAT
I go to the university everyday but thursday
'''
# Same as the previous pattern, with the price wrapped in a capture group
# (available as match.group(1)). The raw string avoids the invalid '\$'
# string escape.
price_after_for = re.compile(r'for (\$[0-9.]+)')
for line in text.split('\n'):
    # Search once per line and reuse the result instead of running
    # the same regex twice.
    match = price_after_for.search(line)
    if match:
        print(match)
        print(line)
<re.Match object; span=(18, 29), match='for $545.00'>
I bought a laptop for $545.00  :) 
<re.Match object; span=(14, 23), match='for $1.50'>
I bought milk for $1.50 only 
In [ ]: