The lab provides practice with generators, strings and regular expressions.
Instructions:
Modify this notebook by adding the Python code described below.
Test your code using the menu item Cell ► Run All
Save the notebook (the .ipynb file) and upload it to the appropriate Assignment folder on the course web site.
Write a generator function, firstn(ls,ln)
that returns an iterator. ls
is a list of strings and ln
is a list of integers. The iterator returns successive strings from ls
truncated to the number of characters given by the corresponding element in ln
. For example if g=firstn(['abc','defg','hijkl'],[1,2,3])
then successive calls to next(g)
would return "a"
, "de"
,"hij"
and list(g)
would return ['a', 'de', 'hij']
. You can assume that ls
and ln
are the same length.
def firstn(ls, ln):
    """Yield each string of ``ls`` cut to the length given in ``ln``.

    ``ls`` and ``ln`` are walked in lockstep; the i-th value produced is
    the first ``ln[i]`` characters of ``ls[i]``.  Because this is a
    generator function, no work is done until the result is iterated.
    """
    yield from (text[:limit] for text, limit in zip(ls, ln))


# Demo from the assignment text.
ls = ['abc', 'defg', 'hijkl']
ln = [1, 2, 3]
g = firstn(ls, ln)
list(g)
['a', 'de', 'hij']
Write a function wordfreq(s)
that returns a list of the number of occurrences of each word in s
. Each element of the returned list is a tuple. The first element of each tuple is the word in lower case and the second is the number of times it occurs in s
. Split s
into words by anything that is not a letter. Ignore capitalization. Order the list in alphabetical order.
For example, if s = "abc123!xyz 123 The QuIcK-abc*xyz. XYZ"
then wordfreq(s) should return [('abc', 2), ('quick', 1), ('the', 1), ('xyz', 3)]
Hints: (1) str.lower(). (2) re.split() or str.isalpha() and str.join(). (3) dict() or count().
s = "abc123!xyz 123 The QuIcK-abc*xyz. XYZ"


def wordfreq(s):
    """Return ``(word, count)`` tuples for the words in ``s``, sorted by word.

    The text is lower-cased and every maximal run of the letters a-z is
    treated as one word; anything else is a separator.  This variant
    counts with ``list.count()``.
    """
    import re

    # findall() collects the letter runs directly.  Splitting on the
    # separators instead would produce '' entries whenever the text starts
    # or ends with a non-letter (or is empty), and those would be reported
    # as a bogus ('', n) word.
    words = re.findall(r'[a-z]+', s.lower())
    return sorted((w, words.count(w)) for w in set(words))


print(wordfreq(s))
# or
def wordfreq(s):
    """Return ``(word, count)`` tuples for the words in ``s``, sorted by word.

    The text is lower-cased and split on every run of characters other
    than a-z.  This variant tallies the words in a dictionary.
    """
    import re

    f = {}
    for w in re.split(r'[^a-z]+', s.lower()):
        # re.split() yields '' when the text begins or ends with a
        # separator (or is empty); that is not a word, so skip it.
        if not w:
            continue
        f[w] = f.get(w, 0) + 1
    return [(k, f[k]) for k in sorted(f)]
# or
def wordfreq(s):
    """Return ``(word, count)`` tuples for the words in ``s``, sorted by word.

    This variant uses no regular expression: every character that is not
    alphabetic is turned into a space, the lower-cased result is split on
    whitespace, and the pieces are tallied in a dictionary.
    """
    spaced = ''.join(ch if ch.isalpha() else ' ' for ch in s.lower())
    tally = {}
    for word in spaced.split():
        tally[word] = tally.get(word, 0) + 1
    return [(word, tally[word]) for word in sorted(tally)]
[('abc', 2), ('quick', 1), ('the', 1), ('xyz', 3)]
Write a function abbrev(l)
that abbreviates each string in the list l
by removing all vowels except the first. Vowels are a
, e
, i
, o
, u
, and y
. You can assume each word contains at least one vowel and one non-vowel and all letters are lower-case. For example, if l=['four', 'brown', 'foxes', 'and', 'a', 'lazy', 'dog']
then abbrev(l)
returns ['for', 'brown', 'foxs', 'and', 'a', 'laz', 'dog']
Hint: One possible implementation is to use re.split()
to: split the word into three strings at the first vowel (using maxsplit=1
and parentheses in the regular expression). Then remove vowels from the third string using re.sub()
and recombine the three strings.
import re
l = "four brown foxes and a lazy dog".split()
print(l)


def abbrev(l):
    """Abbreviate each word in ``l`` by dropping every vowel after the first.

    Vowels are a, e, i, o, u and y (lower case).  Everything up to and
    including the first vowel is kept verbatim; the remaining vowels are
    removed.  A new list is returned and ``l`` is not modified.

    A word containing no vowel at all (including the empty string) is
    returned unchanged rather than raising.
    """
    vowel = re.compile(r'[aeiouy]')
    out = []
    for word in l:
        first = vowel.search(word)
        if first is None:
            # No vowel to anchor on, so there is nothing to strip.
            out.append(word)
            continue
        cut = first.end()  # index just past the first vowel
        out.append(word[:cut] + vowel.sub('', word[cut:]))
    return out


abbrev(l)
['four', 'brown', 'foxes', 'and', 'a', 'lazy', 'dog']
['for', 'brown', 'foxs', 'and', 'a', 'laz', 'dog']
Set the following string variables to the corresponding regular expressions:
re_digit
to a regular expression that exactly matches the string 'one', 'two' or 'three'. For example, it would match the strings 'one' or 'three' but not 'tone' or 'twotwo' or 'threes'.
re_base6
to a regular expression that exactly matches one or more base-6 digits (those between 0 and 5). For example, it would match '0' or '421' but not '' (a null string), '16' or '-10'.
re_id
to a regular expression that exactly matches strings that begin with between one and three upper-case letters followed by three to five digits. For example, it would match 'A123' and 'XYZ00000' but not 'ab123', 'AAAA123' or 'AB12' or 'AB000123'.
For example: re_abc=r"^abc"
would set the variable re_abc
to a regular expression that matches strings beginning with "abc"
.
import re
# Exactly one of the words 'one', 'two' or 'three'.
re_digit = r'^(one|two|three)$'
print(list(map(re.compile(re_digit).match,
               ['one', 'three', 'tone', 'twotwo', 'threes'])))

# One or more digits, each in the range 0-5.
re_base6 = r'^[0-5]+$'
print(list(map(re.compile(re_base6).match,
               ['0', '421', '', '16', '-10'])))

# One to three upper-case letters followed by three to five digits.
re_id = r'^[A-Z]{1,3}[0-9]{3,5}$'
print(list(map(re.compile(re_id).match,
               ['A123', 'XYZ00000', 'ab123', 'AAAA123', 'AB12', 'AB000123'])))
[<re.Match object; span=(0, 3), match='one'>, <re.Match object; span=(0, 5), match='three'>, None, None, None] [<re.Match object; span=(0, 1), match='0'>, <re.Match object; span=(0, 3), match='421'>, None, None, None] [<re.Match object; span=(0, 4), match='A123'>, <re.Match object; span=(0, 8), match='XYZ00000'>, None, None, None, None]
Write a function fixdup(s)
that replaces duplicated words in the string s
with a single instance of the word and returns the string. The words are separated by a single space but can be in any case.
For example "The the", "The the " and "The the the" would all be replaced by "The" but "a b" would be unchanged.
Hint: split() and join(); str.lower().
import re
def fixdup(s):
    """Collapse runs of the same word in ``s`` into a single occurrence.

    Words are compared case-insensitively against the word immediately
    before them; the first spelling of each run is the one kept.  The text
    is split on whitespace and the surviving words are re-joined with
    single spaces.
    """
    kept = []
    previous = None  # lower-cased form of the preceding word, if any
    for word in s.split():
        folded = word.lower()
        if folded != previous:
            kept.append(word)
        previous = folded
    return ' '.join(kept)


for s in ["The the", "The the " , "The the the", "a\nb"]:
    print(repr(fixdup(s)))
'The' 'The' 'The' 'a b'
# lab validation code; do not modify
def labcheck():
    """Run the five question checks and print one result line per question.

    The checks exercise the notebook-level names ``firstn``, ``wordfreq``,
    ``abbrev``, ``re_digit``, ``re_base6``, ``re_id`` and ``fixdup``.  Any
    exception raised by a check (a failed assertion, or an error raised by
    the code under test) is caught and reported as
    ``Failed check for Question N: ...``; otherwise ``Question N OK.`` is
    printed.  Nothing is returned.
    """
    import copy, random, re, string, types
    from random import randint

    def checkre(pat, ok, nok):
        """Assert ``pat`` fully matches every string in ``ok`` and none in ``nok``."""
        for s in ok:
            assert re.fullmatch(pat, s), \
                f"pattern '{pat}'\n did not match string '{s}'"
        for s in nok:
            assert not re.fullmatch(pat, s), \
                f"pattern '{pat}'\n matched string '{s}'"

    def randwords(n, chars=string.ascii_lowercase, nl=(2,5)):
        """Return ``n`` distinct random words drawn from ``chars``.

        Each word's length is chosen uniformly from the inclusive range ``nl``.
        """
        l = set()
        while len(l) < n:
            l |= set((''.join([chars[randint(0,len(chars)-1)] for i in range(randint(*nl))]),))
        return list(l)

    def q1():
        """firstn(): must be a generator function yielding correct prefixes."""
        n = randint(3,5)
        ls = randwords(n)
        ln = [randint(1,n) for n in map(len,ls)]
        g = firstn(ls,ln)
        l = list(g)
        assert isinstance(firstn, types.FunctionType), \
            f"firstn() has type {type(firstn)}"
        assert isinstance(g, types.GeneratorType), \
            f"return from firstn() has type {type(g)}"
        # The length test stops a generator that yields too few items from
        # passing vacuously; the message reports the actual inputs.
        assert len(l) == len(ls) and \
            all(len(out) == cnt and src.startswith(out)
                for src, cnt, out in zip(ls, ln, l)), \
            f"firstn({ls},{ln}) returns {l}"

    def q2():
        """wordfreq(): random words, random separators, random capitalisation."""
        joins = 3*[' ', ' ', ' ']+list("!&*012345")+"--,$$$,::,<<<,>>".split(',')
        n = randint(5,8)
        l = sorted(randwords(n))
        f = [[1,1,1,2,2,3][randint(0,5)] for i in range(n)]
        xl = []
        for i in range(n):
            xl.extend([l[i]]*f[i])
        random.shuffle(xl)
        # Every word is followed by a separator, so the text always ends
        # with a non-letter.
        s = ''.join([xl[i]+random.choice(joins) for i in range(len(xl))])
        s = ''.join([c.upper() if not randint(0,4) else c for c in s])
        rf = wordfreq(s)
        cf = list(zip(l,f))
        assert rf == cf, f"wordfreq('{s}') returns {rf} instead of {cf}"

    def q3():
        """abbrev(): each result keeps exactly one vowel, the original first one."""
        t = False
        # Redraw until every word has at least one vowel and one consonant.
        while not t:
            l = randwords(randint(5,10),"bdlnaeiouy",(4,6))
            t = all([len([c for c in w if c in "aeiouy"])>0 for w in l]) and \
                all([len([c for c in w if c in "bdln"])>0 for w in l])
        ol = copy.deepcopy(l)
        rl = abbrev(l)
        assert all([len(re.findall(r'[aeiouy]',s)) == 1 for s in rl]) and \
            [re.findall(r'[aeiouy]',s)[0] for s in ol] == [re.findall(r'[aeiouy]',s)[0] for s in rl], \
            f"abbrev('{ol}') returned {rl}"

    def q4():
        """The three regular-expression variables, checked with re.fullmatch()."""
        checkre(re_digit,
                ['one', 'two', 'three'],
                [' one', 'twotwo', 'threes'])
        checkre(re_base6,
                ['0' , '421', '12354' ],
                ['' , '16' , '-10', '7'])
        checkre(re_id,
                ['A123' , 'XYZ00000' , ],
                ['ab123', 'AAAA123' , 'AB12' , 'AB000123'])

    def q5():
        """fixdup(): duplicated words collapse; distinct words are untouched."""
        ok = ["The the ", " the the", "The the the"]
        nok = ["a b"]
        for s in ok:
            r = fixdup(s)
            assert r == s.split()[0], f"fixdup({s!r}) returns {r!r}"
        for s in nok:
            r = fixdup(s)
            assert r == s, f"fixdup({s!r}) returns {r!r}"

    # The checks are listed explicitly instead of being discovered through
    # locals(), so the order is fixed and no helper can be picked up by
    # accident.
    for number, check in enumerate((q1, q2, q3, q4, q5), start=1):
        try:
            check()
        except Exception as e:
            print(f"Failed check for Question {number}: {e}")
        else:
            print(f"Question {number} OK.")


labcheck()
Question 1 OK. Question 2 OK. Question 3 OK. Question 4 OK. Question 5 OK.