/*
 * MediaWiki:Gadget-PageCleanUp.js
 *
 * Let op! Nadat u de veranderingen heeft opgeslagen, moet u de cache van uw
 * browser nog legen om ze daadwerkelijk te zien:
 *   - Mozilla (incl. Firefox): Ctrl+Shift+R
 *   - Internet Explorer: Ctrl+F5
 *   - Opera: F5
 *   - Safari: Cmd+R
 *   - Konqueror: F5
 */
/*jshint boss:true*/
/*global $, mw*/
// Oorspronkelijk gekopieerd van [[en:User:Samwilson/PageCleanUp.js]]
/**
* This script adds a toolbar button for cleaning up the OCR text.
*/
( function ( mw, $ ) {
/**
 * Apply a fixed sequence of regex-based clean-ups to raw (OCR) page text.
 *
 * The steps run in three phases:
 *  1. whitespace, dash and hyphenation normalisation (always);
 *  2. paragraph re-flowing (skipped when the text contains a `<poem>` tag,
 *     because line breaks are significant there);
 *  3. punctuation, entity, quote, common OCR-misreading and template fixes.
 *
 * The order of the replacements matters: later steps rely on the output of
 * earlier ones, so do not reorder them casually.
 *
 * @param {string} text Wikitext to clean up.
 * @return {string} The cleaned-up wikitext.
 */
function cleanUp( text ) {
	text = text
		// Start by trimming leading and trailing whitespace.
		.trim()
		// Remove trailing spaces at the end of each line.
		.replace( / +\n/g, '\n' )
		// Remove trailing whitespace preceding a hard line break.
		.replace( / +<br *\/?>/g, '<br />' )
		// Remove trailing whitespace and numerals at the end of page text
		// (numerals are nearly always page numbers in the footer).
		.replace( /[\s\d]+$/g, '' )
		// Remove trailing spaces at the end of refs.
		.replace( / +<\/ref>/g, '</ref>' )
		// Remove trailing spaces at the end of template calls.
		.replace( / +}}/g, '}}' )
		// Convert double-hyphen to em dash (avoiding breaking HTML comment
		// syntax, i.e. "<!--" and "-->").
		.replace( /([^\!])--([^>])/g, '$1—$2' )
		// Replace double-em-dash with a two-em bar.
		.replace( /——/g, '{{bar|2}}' )
		// Em dash: always exactly one space on either side.
		.replace( /\s*—\s*/g, ' — ' )
		// En dash: no surrounding spaces.
		.replace( /\s*–\s*/g, '–' )
		// Join words that are hyphenated across a line break
		// (but leave "|-" table syntax alone).
		.replace( /([^\|])-\n/g, '$1' );

	// Re-flow paragraphs, but only for pages that don't have <poem>.
	if ( text.indexOf( "<poem>" ) === -1 ) {
		text = text
			// Lines that start with " should probably be new paragraphs,
			// if the previous line ends in punctuation other than a comma
			// or semicolon; drop the space after the quote while at it.
			.replace( /([^\n\w,;])\n\" */g, '$1\n\n"' )
			// Lines that end with " should probably precede a new paragraph,
			// unless preceded by a comma, or unless the next line starts
			// with a lower-case letter; drop the space before the quote.
			.replace( /([^,])\ *\"\n([^a-z\n])/g, '$1"\n\n$2' )
			// Remove single line breaks; preserve multiple. But not if
			// there's tag, template or table syntax either side of the break.
			.replace( /([^>}\|\n])\n([^:#\*<{\|\n])/g, '$1 $2' )
			// Collapse sequences of spaces into a single space.
			.replace( / +/g, ' ' );
	}

	// More page cleanup.
	text = text
		// Dump spurious hard breaks at the end of paragraphs.
		.replace( /<br *\/?>\n\n/g, '\n\n' )
		// Remove unwanted spaces before punctuation marks.
		.replace( / ([;:\?!,.])/g, '$1' )
		// Unicodify: turn the named HTML entities into the real characters.
		// (These must match the entity *names*; matching the character
		// itself would be a no-op.)
		.replace( /&mdash;/g, '—' )
		.replace( /&ndash;/g, '–' )
		.replace( /&quot;/g, '"' )
		// Straighten quotes and apostrophes.
		.replace( /[“”]/g, '"' )
		.replace( /[‘’`]/g, '\'' )
		// OCR fixes.
		// Convert i9 to 19, etc.
		.replace( /[il]([0-9])/g, '1$1' )
		// "the", "them", "their", etcetera.
		.replace( /tlie/g, 'the' )
		// "U" -> "ll" when preceded by a lowercase letter.
		.replace( /([a-z])U/g, '$1ll' )
		// "would", "could", "should".
		.replace( /woidd/g, 'would' )
		.replace( /coidd/g, 'could' )
		.replace( /shoidd/g, 'should' )
		// Many works have apostrophes missing from OCR.
		.replace( /([a-z]) s\b/g, '$1\'s' ) // it's, he's, etc.
		.replace( /n t\b/g, 'n\'t' ) // can't, isn't, didn't, etc.
		.replace( /([a-zI]) ll\b/g, '$1\'ll' ) // I'll, we'll, etc.
		.replace( /\bI m\b/g, 'I\'m' ) // I'm
		.replace( /\b([Yy])ou re\b/g, '$1ou\'re' ) // you're
		.replace( /\b([Ww])e re\b/g, '$1e\'re' ) // we're
		.replace( /\b([Tt])hey re\b/g, '$1hey\'re' ) // they're
		.replace( /([a-zI]) ve\b/g, '$1\'ve' ) // I've, we've, etc.
		// Expand diacritical templates. The replacement is split into two
		// string literals, presumably so that the wiki does not expand the
		// "subst" marker when this script page itself is saved; keep it split.
		.replace( /{{((ae|oe|\w[:`'~^-]))}}/g, '{{subst' + ':$1}}' )
		// Replace "float center" with "block center"; the original template
		// name was misleading enough to warrant routinely fixing.
		.replace( /\{\{float center/g, '{{block center' )
		// Center tags are converted to the {{center}} template.
		// [\s\S] matches any character including newlines; inside a
		// character class "." is only a literal dot, so [.\n] would not do.
		.replace( /<center>\s*([\s\S]*?)\s*<\/center>/g, '{{center|$1}}' )
		// Full stop followed by a lower case letter should probably be a comma.
		.replace( /\.(\s[a-z])/g, ',$1' )
		// A comma followed by a capital letter other than "I" should
		// probably be a full stop.
		.replace( /,(\s[A-HJ-Z])/g, '.$1' )
		// Remove unwanted ligatures (U+FB01 "fi" and U+FB02 "fl"), everywhere
		// in the text. Written as escapes so that Unicode normalisation of
		// this source file cannot turn the patterns into plain letters.
		.replace( /\uFB01/g, 'fi' )
		.replace( /\uFB02/g, 'fl' );

	return text;
}
// Once the WikiEditor toolbar is ready, add a "Page clean-up" button to the
// "format" group of the "main" section.
mw.hook( 'wikiEditor.toolbarReady' ).add( function ( $textarea ) {
	// Button callback: replace the textarea content with its cleaned-up version.
	function runCleanUp() {
		$textarea.val( cleanUp( $textarea.val() ) );
	}

	// Definition of the single tool this gadget contributes.
	var cleanUpTool = {
		label: 'Page clean-up',
		type: 'button',
		icon: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/15/Text-x-generic-apply.svg/22px-Text-x-generic-apply.svg.png',
		action: {
			type: 'callback',
			execute: runCleanUp
		}
	};

	$textarea.wikiEditor( 'addToToolbar', {
		section: 'main',
		group: 'format',
		tools: {
			'Samwilson-PageCleanUp': cleanUpTool
		}
	} );
} );
}( mediaWiki, jQuery ) );