一个可以将html转为moin的脚本

Python/PHP/Perl 开发与设计
回复
头像
oneleaf
论坛管理员
帖子: 10230
注册时间: 2005-03-27 0:06
系统: Ubuntu 12.04
送出感谢: 7 次
接收感谢: 103 次

一个可以将html转为moin的脚本

#1

帖子 oneleaf » 2006-11-27 16:37

代码: 全选

#!/usr/bin/python
# -*- coding: utf-8 -*- 
import re

def replace(strs,searchfirststr,searchendstr,firststr='',endstr='',formatmod=0):
    """Rewrite every span delimited by two literal markers.

    A span starts at ``searchfirststr`` and ends at the nearest following
    ``searchendstr`` (non-greedy, may cross line breaks).  ``formatmod``
    selects what each span is replaced with:

    * 0 (or any unknown value): the empty string, i.e. the span is deleted
    * 1: ``firststr`` + the original span with its inner text stripped
      + ``endstr``  (wraps the span, keeping the markers)
    * 2: ``firststr`` + stripped inner text + ``endstr``  (drops the markers)
    * 3: ``firststr`` + ``endstr``  (drops the markers and the inner text)

    Returns the rewritten string; ``strs`` itself is not modified.
    """
    # The markers are literal text (mode 1 re-emits them verbatim), so they
    # are escaped before being embedded in the regular expression.
    pattern=re.compile(
        re.escape(searchfirststr)+'(.*?)'+re.escape(searchendstr),re.DOTALL)

    def _render(match):
        inner=match.group(1)
        if formatmod==1:
            return firststr+searchfirststr+inner.strip()+searchendstr+endstr
        if formatmod==2:
            return firststr+inner.strip()+endstr
        if formatmod==3:
            return firststr+endstr
        return ''

    # A single substitution pass rewrites each span exactly once.  Replacing
    # match-by-match with str.replace would wrap duplicated spans repeatedly
    # in mode 1, because the replacement text still contains the span.
    return pattern.sub(_render,strs)


 #将<a href="/wiki/User:Jiyuu0" title="User:Jiyuu0">Chua Wen Kiat</a>
def replaceuserhref(strs):
    """Replace links to wiki user pages with their visible link text.

    Each ``<a href="/wiki/User:...">text</a>`` element found in ``strs`` is
    replaced by ``text``.  When no link text can be extracted on a single
    line, the element is replaced by the placeholder ``'.'``.

    Returns the rewritten string.
    """
    link_bodies=re.compile('<a href="/wiki/User:(.*?)</a>',re.DOTALL).findall(strs)
    result=strs
    for body in link_bodies:
        # Rebuild the full element text so it can be replaced literally.
        anchor='<a href="/wiki/User:'+body+'</a>'
        text_match=re.search('>(.*?)</a>',anchor)
        label=text_match.group(1) if text_match else '.'
        result=result.replace(anchor,label)
    return result


def replacehref(strs):
    """Convert HTML hyperlinks to MoinMoin link markup.

    Each ``<a href="URL" ...>TEXT</a>`` element becomes ``[URL TEXT]`` when
    URL contains a ``:`` after its first character (e.g. ``http://...``),
    and ``[:URL:TEXT]`` otherwise.  Example input::

        <a href="http://linux.edu.lv/" class="external text">Linux Center</a>

    If the URL cannot be extracted it falls back to ``'.'``; if the link
    text cannot be extracted on a single line, the URL is used as the text.

    Returns the rewritten string.
    """
    anchor_pattern=re.compile('<a href="(.*?)</a>',re.DOTALL)

    def _to_moin(match):
        anchor=match.group(0)
        # Stop at the closing quote.  Requiring a space after it would lose
        # the URL of anchors whose only attribute is href.
        href=re.search('href="(.*?)"',anchor)
        url=href.group(1) if href else '.'
        text=re.search('>(.*?)</a>',anchor)
        title=text.group(1) if text else url
        if url.find(':')>0:
            return '['+url+' '+title+']'
        return '[:'+url+':'+title+']'

    return anchor_pattern.sub(_to_moin,strs)

def replaceul(strs):
    """Convert HTML unordered lists to MoinMoin bullet lists.

    Each ``<ul>...</ul>`` element, e.g. ``<ul><li>Save the edited file</li></ul>``,
    is replaced by one line per ``<li>`` item of the form
    ``'\\n * ' + item``.  A ``<ul>`` with no ``<li>`` items is removed.

    Returns the rewritten string.
    """
    list_pattern=re.compile('<ul>(.*?)</ul>',re.DOTALL)
    item_pattern=re.compile('<li>(.*?)</li>',re.DOTALL)

    def _render(match):
        # ' * ' is the MoinMoin bullet marker; ' 1. ' would number the items.
        return ''.join('\n'+' * '+item for item in item_pattern.findall(match.group(1)))

    return list_pattern.sub(_render,strs)

def replaceol(strs):
    """Convert HTML ordered lists to MoinMoin numbered lists.

    Each ``<ol>...</ol>`` element, e.g. ``<ol><li>Save the edited file</li></ol>``,
    is replaced by one line per ``<li>`` item of the form
    ``'\\n 1. ' + item``.  An ``<ol>`` with no ``<li>`` items is removed.

    Returns the rewritten string.
    """
    list_pattern=re.compile('<ol>(.*?)</ol>',re.DOTALL)
    item_pattern=re.compile('<li>(.*?)</li>',re.DOTALL)

    def _render(match):
        # ' 1. ' is the MoinMoin numbered-item marker; ' * ' would be a bullet.
        return ''.join('\n'+' 1. '+item for item in item_pattern.findall(match.group(1)))

    return list_pattern.sub(_render,strs)

def replacepre(strs):
    """Convert HTML ``<pre>`` blocks to MoinMoin ``{{{ }}}`` code blocks.

    Inside each ``<pre>...</pre>`` element, every ``<a href=...>text</a>``
    link is reduced to its link text (or to its URL, or ``'.'``, when no
    text can be extracted on a single line).  Blank lines are dropped, the
    remaining lines are stripped of surrounding whitespace, and the result
    is emitted as ``' {{{'`` + lines (each followed by a newline) + ``'}}}'``.

    Returns the rewritten string.
    """
    pre_pattern=re.compile('<pre>(.*?)</pre>',re.DOTALL)
    anchor_pattern=re.compile('<a href=(.*?)</a>',re.DOTALL)

    def _anchor_text(match):
        anchor=match.group(0)
        text=re.search('>(.*?)</a>',anchor)
        if text:
            return text.group(1)
        # Fall back to the URL; stop at the closing quote so anchors whose
        # only attribute is href are handled too.
        href=re.search('href="(.*?)"',anchor)
        return href.group(1) if href else '.'

    def _render(match):
        body=anchor_pattern.sub(_anchor_text,match.group(1))
        # '!=' instead of the Python-2-only '<>' operator.
        kept=[line.strip() for line in body.split('\n') if line.strip()!='']
        return ' {{{'+''.join(line+'\n' for line in kept)+'}}}'

    return pre_pattern.sub(_render,strs)

if __name__ == "__main__":
    # Convert the HTML page saved as html.txt into MoinMoin markup and write
    # the result to a.txt.
    sreturn='\n'

    # Drop blank lines and surrounding whitespace; every kept line is
    # prefixed with a newline, so the text starts with one.
    with open('html.txt', 'r') as source:
        lines=''.join(sreturn+line.strip() for line in source if line.strip()!='')

    # Remove <div class="editsection" ...</div> blocks.
    lines=replace(lines,'<div class="editsection"','</div>','','',0)

    # Wrap <a name="Getting_Started"></a> anchors as
    # [[HTML(<a name="Getting_Started"></a>)]].
    lines=replace(lines,'<a name=','</a>','[[HTML(',')]]',1)

    # Turn <h1>..<h5> headings into "= x =" .. "===== x =====".
    for level in range(1,6):
        marker='='*level
        lines=replace(lines,'<h%d>'%level,'</h%d>'%level,
                      sreturn+marker+' ',' '+marker+sreturn,2)

    # <b>x</b> becomes '''x''' (bold).
    lines=replace(lines,'<b>','</b>','\'\'\'','\'\'\'',2)

    # <p>x</p> becomes x followed by a newline.
    lines=replace(lines,'<p>','</p>','',''+sreturn,2)

    # <dl> and <dd> wrappers are dropped, keeping their stripped content.
    lines=replace(lines,'<dl>','</dl>','','',2)
    lines=replace(lines,'<dd>','</dd>','','',2)

    # <pre>x</pre> becomes a {{{ }}} block.
    lines=replacepre(lines)

    # <i>x</i> becomes ''x'' (italic).
    lines=replace(lines,'<i>','</i>','\'\'','\'\'',2)

    # <br> becomes a newline.
    lines=replace(lines,'<br>','',sreturn,'',3)

    # User-page links first, then all remaining links.
    lines=replaceuserhref(lines)
    lines=replacehref(lines)

    # Lists.
    lines=replaceul(lines)
    lines=replaceol(lines)

    # Decode the &gt; entity back to a literal '>'.
    lines=lines.replace('&gt;','>')

    # Point in-page anchors ([:#...) at the EdgyGuide page.
    lines=lines.replace('[:#',' [:EdgyGuide#')

    with open('a.txt', 'w') as target:
        target.write(lines)
回复

回到 “Python/Php/Perl”