#!/usr/bin/python
#coding:utf-8
#
# **PT_BR***********************************************************************
# Dado um arquivo com endereços de páginas da Internet e uma opção que indica o
# idioma, o script substitui todos os artigos determinados por sublinhados,
# para que o texto possa ser impresso como atividade extra no estudo dos
# substantivos femininos e masculinos.
# Data: 30/06/2010
# ******************************************************************************
#
# **EN*************************************************************************
# Given a file with URLs and an option indicating the target language,
# this script replaces the definite articles of that language in the specified
# pages with underscores so the text can be printed as an extra exercise for
# the study of feminine and masculine nouns.
# Date: 06/30/2010
# ******************************************************************************
#
#
# Artigos definidos & suas expressões regulares
#
# pt,pt_BR : o, a, os, as
#
# fr: la, le, las, les, l'
#
# it: la, lo, le, l', il, i, gli
#
# de: der, das, die
#
# es: el, los, la, las
#
# Maps a language code to a regular expression that matches the definite
# articles of that language.  Upper/lower case is spelled out letter by letter
# with character classes, so the patterns work without re.IGNORECASE.
LANG = {
    # der, das, die
    'de': r"\b(?:[dD][eE][rR]|[dD][aA][sS]|[dD][iI][eE])\b",
    # el, los, la, las -- every letter accepts both cases, so "EL" in a
    # capitalised headline is matched as well.
    'es': r"\b(?:[eE][lL]|[lL][oO][sS]|[lL][aA][sS]?)\b",
    # la, las, le, les, l' -- the elided form l' must start a word; without
    # the leading boundary it would also fire inside tokens such as rel='...'.
    'fr': r"\b(?:[lL][aA][sS]?\b|[lL][eE][sS]?\b|[lL]')",
    # la, lo, le, l', il, i, gli
    'it': r"\b(?:[lL](?:[aA]|[oO]|[eE])|[lL]'|[iI][lL]?|[gG][lL][iI])\b",
    # o, a, os, as
    'pt': r"\b[oOaA][sS]?\b",
}
# Elements to be removed from the page:
#   links  -- the <a href="..."> wrapper is dropped, the link text is kept
#             (capture group 1 holds that text)
#   videos -- <object>, <param> and <embed> opening tags are deleted
#
# The quantifiers are deliberately bounded ([^"]+, [^>]*, .+?) so that each
# match stops at the end of its own tag.  A greedy ".+" would run to the last
# closing tag on the line and swallow everything between two links.
# Note that [^>] also matches a newline, so a tag split over several lines
# is still recognised.
LINK_REGEX = r'<a href="[^"]+"[^>]*>(.+?)</a>'
REMOVE_REGEX = [r"<object [^>]+>", r"<param [^>]+>", r"<embed [^>]+>"]
# "|"-separated list of the supported language codes, shown in the usage
# message.  Iterating the dict yields its keys in the same order as
# LANG.keys(); str.join avoids the per-key index() lookup, which is quadratic
# and unavailable on Python 3 dict views.
lang_codes = "|".join(LANG)
from optparse import OptionParser

# Command-line interface.  The usage line ends with the list of supported
# language codes built earlier in the script.
parser = OptionParser(usage="%prog: -f FILE -l {0}".format(lang_codes))

# Target language of the exercise.
parser.add_option(
    "-l", "--language",
    metavar="TARGET_LANG",
    dest="language",
    help="select target language",
)

# Text file holding the addresses of the pages to download.
parser.add_option(
    "-f", "--file",
    metavar="FILE",
    dest="filename",
    help="select FILE with URL addresses to fetch",
)

# optparse leaves an option at None when it is not given on the command line.
options, args = parser.parse_args()
# Standard-library helpers for the download/rewrite step.  They are imported
# here, next to their only users, exactly as the original script did.
import re
from contextlib import closing

try:  # Python 2
    from urllib2 import urlopen
except ImportError:  # Python 3
    from urllib.request import urlopen


def _read_urls(path):
    """Return the non-blank lines of the file at *path*, whitespace-stripped.

    Blank lines (for instance a trailing newline at the end of the file) are
    skipped so they are never handed to urlopen().
    """
    with open(path, 'r') as url_file:
        return [line.strip() for line in url_file if line.strip()]


def _fetch_text(url):
    """Download *url* and return a ``(text, encoding)`` pair.

    *text* is the decoded page and *encoding* the codec that decoded it.  The
    charset declared in the Content-Type header is tried first, then UTF-8;
    Latin-1 is the last resort because it can decode any byte sequence.
    """
    with closing(urlopen(url)) as response:
        raw = response.read()
        headers = response.info()
    # Python 3 header objects offer get_content_charset(); the Python 2
    # ones expose the same information through getparam().
    if hasattr(headers, "get_content_charset"):
        declared = headers.get_content_charset()
    else:
        declared = headers.getparam("charset")
    for encoding in (declared, "utf-8"):
        if not encoding:
            continue
        try:
            return raw.decode(encoding), encoding
        except (LookupError, UnicodeDecodeError):
            # Unknown codec name or bytes that do not fit it: try the next.
            continue
    return raw.decode("latin-1"), "latin-1"


def _blank_articles(text, article_regex):
    """Return *text* with links unwrapped, video tags removed, articles blanked.

    *article_regex* is a compiled pattern; every match is replaced by four
    underscores.
    """
    # Keep the link text, drop the surrounding <a> element.
    text = re.sub(LINK_REGEX, r"\1", text)
    for unwanted in REMOVE_REGEX:
        text = re.sub(unwanted, "", text)
    return article_regex.sub("____", text)


if options.language is None or options.filename is None or options.language not in LANG:
    parser.print_help()
else:
    # The page is processed as decoded text with re.UNICODE so that \b treats
    # accented letters as word characters.  On raw bytes the "le" in "rôle"
    # looks like a separate word and the result is "rô____".
    article_regex = re.compile(LANG[options.language], re.UNICODE)
    url_list = _read_urls(options.filename)
    for exercise_counter, url in enumerate(url_list):
        page, encoding = _fetch_text(url)
        new_page = _blank_articles(page, article_regex)
        # Writing page to disk, in the encoding it was downloaded in so any
        # charset declared inside the HTML stays valid.
        out_name = 'exercicio_' + options.language + "_" + str(exercise_counter)
        with open(out_name, 'wb') as out_file:
            out_file.write(new_page.encode(encoding))
Mas testando com uma página do Le Figaro, palavras como rôle acabam ficando como rô___. Mesmo com os boundaries. Para os 'usuários finais' e gente sem Python no PC com o OpenOffice/BROffice, basta fazer o seguinte:
- Cole o texto
- Clique em Editar > Localizar e Substituir
- Clique em Mais opções
- Marque Expressões regulares
- Na caixa Procurar por, digite \<[Ll][eE][sS]?\>|\<[Ll][aA][sS]?\>|[Ll]' e, em Substituir por, ____
