MLEB/Translate/src/PageTranslation/TranslatablePageParser.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216

<?php
declare( strict_types = 1 );

namespace MediaWiki\Extension\Translate\PageTranslation;

use MediaWiki\Extension\Translate\Utilities\ParsingPlaceholderFactory;

/**
 * Generates ParserOutput from text or removes all tags from a text.
 *
 * Content inside \<nowiki> is swapped for placeholders before any \<translate> handling, so
 * translate tags that only appear inside nowiki are never treated as markup.
 *
 * @author Niklas Laxström
 * @license GPL-2.0-or-later
 * @since 2020.08
 */
class TranslatablePageParser {
	/** @var ParsingPlaceholderFactory Source of the placeholder strings used while parsing */
	private $placeholderFactory;

	public function __construct( ParsingPlaceholderFactory $placeholderFactory ) {
		$this->placeholderFactory = $placeholderFactory;
	}

	/**
	 * Check whether the text has an opening or a closing translate tag outside of nowiki.
	 *
	 * @param string $text
	 * @return bool
	 */
	public function containsMarkup( string $text ): bool {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );
		return preg_match( '~</?translate[ >]~', $text ) !== 0;
	}

	/**
	 * Remove all opening and closing translate tags following the same whitespace rules as the
	 * regular parsing. This doesn't try to parse the page, so it can handle unbalanced tags.
	 *
	 * Translation unit markers are removed as well, and the result is passed through
	 * TranslationUnit::getTextForTrans() before the nowiki content is restored.
	 *
	 * @param string $text
	 * @return string
	 */
	public function cleanupTags( string $text ): string {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		// Tags: one newline directly after an opening tag or before a closing tag goes with it
		$text = preg_replace( '~<translate( nowrap)?>\n?~s', '', $text );
		$text = preg_replace( '~\n?</translate>~s', '', $text );

		// Markers: headers and the rest
		$ic = preg_quote( TranslationUnit::UNIT_MARKER_INVALID_CHARS, '~' );
		$text = preg_replace( "~(^=.*=) <!--T:[^$ic]+-->$~um", '\1', $text );
		$text = preg_replace( "~<!--T:[^$ic]+-->[\n ]?~um", '', $text );

		// Remove variables
		$unit = new TranslationUnit( $text );
		$text = $unit->getTextForTrans();

		return $this->unarmourNowiki( $nowiki, $text );
	}

	/**
	 * Parse the text into a template and the translation units found inside each pair of
	 * translate tags.
	 *
	 * @param string $text
	 * @return ParserOutput Constructed from the text where each pair of translate tags (with its
	 *  content) is replaced by a placeholder, the Section objects keyed by those placeholders, and
	 *  the translation units keyed by their own placeholders.
	 * @throws ParsingFailure If the tags are nested or unbalanced, or if a unit cannot be parsed.
	 */
	public function parse( string $text ): ParserOutput {
		$nowiki = [];
		$text = $this->armourNowiki( $nowiki, $text );

		$sections = [];
		$tagPlaceHolders = [];

		// Non-greedy, so each round takes the first opening tag up to the nearest closing tag
		$re = '~(<translate(?: nowrap)?>)(.*?)</translate>~s';
		$matches = [];

		// Anything other than 1 (no match or failure) ends the loop
		while ( preg_match( $re, $text, $matches, PREG_OFFSET_CAPTURE ) === 1 ) {
			// Whole match, including both tags, and its byte offset in $text
			[ $contentWithTags, $offsetStart ] = $matches[0];
			$openTag = $matches[1][0];
			$contentWithoutTags = $matches[2][0];
			$offsetEnd = $offsetStart + strlen( $contentWithTags );

			// Replace the whole match with a placeholder
			$ph = $this->placeholderFactory->make();
			$text = substr( $text, 0, $offsetStart ) . $ph . substr( $text, $offsetEnd );

			// An opening tag in the content means that another opening tag came before
			// the closing tag that ended this match
			if ( preg_match( '~<translate( nowrap)?>~', $contentWithoutTags ) !== 0 ) {
				throw new ParsingFailure(
					'Nested tags',
					[ 'pt-parse-nested', $contentWithoutTags ]
				);
			}

			$canWrap = $openTag !== '<translate nowrap>';

			// Parse the content inside the tags
			$contentWithoutTags = $this->unarmourNowiki( $nowiki, $contentWithoutTags );
			$parse = $this->parseSection( $contentWithoutTags, $canWrap );

			// Update list of sections and the template with the results
			$sections += $parse['sections'];
			$tagPlaceHolders[$ph] = new Section( $openTag, $parse['template'], '</translate>' );
		}

		// All complete pairs are placeholders by now, so any tag still left has no counterpart
		if ( preg_match( '~<translate( nowrap)?>~', $text ) !== 0 ) {
			throw new ParsingFailure(
				'Unmatched opening tag',
				[ 'pt-parse-open', $this->getPrettyTemplate( $text, $tagPlaceHolders ) ]
			);
		} elseif ( strpos( $text, '</translate>' ) !== false ) {
			throw new ParsingFailure(
				"Unmatched closing tag",
				[ 'pt-parse-close', $this->getPrettyTemplate( $text, $tagPlaceHolders ) ]
			);
		}

		$text = $this->unarmourNowiki( $nowiki, $text );

		return new ParserOutput( $text, $tagPlaceHolders, $sections );
	}

	/**
	 * Replace the placeholders of already parsed sections with "[...]" for use in error
	 * parameters. Only called on the failure paths of parse(), so a successful parse does not
	 * pay for these replacements.
	 *
	 * @param string $text Template text containing section placeholders
	 * @param array $tagPlaceHolders Map whose keys are the placeholders to replace
	 * @return string
	 */
	private function getPrettyTemplate( string $text, array $tagPlaceHolders ): string {
		return str_replace( array_keys( $tagPlaceHolders ), '[...]', $text );
	}

	/**
	 * Splits the content marked with \<translate> tags into translation units, which are
	 * separated with two or more newlines. Extra whitespace is captured in the template and
	 * is not included in the translation units.
	 * @internal
	 *
	 * @param string $text Content between the tags
	 * @param bool $canWrap Passed to every created unit with setCanWrap()
	 * @return array Map with two keys: 'template' is the text where each unit is replaced by a
	 *  placeholder, 'sections' is a map from those placeholders to TranslationUnit objects.
	 * @throws ParsingFailure From parseUnit()
	 */
	public function parseSection( string $text, bool $canWrap ): array {
		$flags = PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE;
		$parts = preg_split( '~(^\s*|\s*\n\n\s*|\s*$)~', $text, -1, $flags );

		// Content without any newline is inline
		$inline = strpos( $text, "\n" ) === false;

		$template = '';
		$sections = [];

		foreach ( $parts as $part ) {
			if ( trim( $part ) === '' ) {
				// Captured delimiter: whitespace stays in the template as is
				$template .= $part;
				continue;
			}

			$ph = $this->placeholderFactory->make();
			$unit = $this->parseUnit( $part );
			$unit->setIsInline( $inline );
			$unit->setCanWrap( $canWrap );
			$sections[$ph] = $unit;
			$template .= $ph;
		}

		return [
			'template' => $template,
			'sections' => $sections,
		];
	}

	/**
	 * Checks if this unit already contains a section marker. If there is not, the unit gets
	 * TranslationUnit::NEW_UNIT_ID as its id. If there is one, its value is used as the id and
	 * the marker is removed from the content.
	 * @internal
	 *
	 * @param string $content
	 * @return TranslationUnit
	 * @throws ParsingFailure If there are multiple markers, the marker is in an unsupported
	 *  position, or there is no content besides the marker.
	 */
	public function parseUnit( string $content ): TranslationUnit {
		$re = '~<!--T:(.*?)-->~';
		$matches = [];
		$count = preg_match_all( $re, $content, $matches, PREG_SET_ORDER );

		if ( $count > 1 ) {
			throw new ParsingFailure(
				'Multiple translation unit markers',
				[ 'pt-shake-multiple', $content ]
			);
		}

		if ( $count !== 1 ) {
			// If no id given in the source, default to a new section id
			return new TranslationUnit( $content, TranslationUnit::NEW_UNIT_ID );
		}

		// Exactly one marker: the first capture group of the only match is the id
		$id = $matches[0][1];

		// Currently handle only these two standard places.
		// Is this too strict?
		$rer1 = '~^<!--T:(.*?)-->( |\n)~'; // Normal sections
		$rer2 = '~\s*<!--T:(.*?)-->$~m'; // Sections with title
		$content = preg_replace( $rer1, '', $content );
		$content = preg_replace( $rer2, '', $content );

		// A marker that survived both replacements was not in either supported place
		if ( preg_match( $re, $content ) === 1 ) {
			throw new ParsingFailure(
				'Translation unit marker is in unsupported position',
				[ 'pt-shake-position', $content ]
			);
		} elseif ( trim( $content ) === '' ) {
			throw new ParsingFailure(
				'Translation unit has no content besides marker',
				[ 'pt-shake-empty', $id ]
			);
		}

		return new TranslationUnit( $content, $id );
	}

	/**
	 * Replace every \<nowiki>...\</nowiki> span, tags included, with a placeholder.
	 * @internal
	 *
	 * @param array &$holders Receives a map from placeholder to the text it replaced
	 * @param string $text
	 * @return string Text with the placeholders in place
	 */
	public function armourNowiki( array &$holders, string $text ): string {
		$re = '~(<nowiki>)(.*?)(</nowiki>)~s';
		$matches = [];

		while ( preg_match( $re, $text, $matches ) ) {
			$ph = $this->placeholderFactory->make();
			// Identical spans elsewhere in the text end up sharing this placeholder
			$text = str_replace( $matches[0], $ph, $text );
			$holders[$ph] = $matches[0];
		}

		return $text;
	}

	/**
	 * Put back the text that armourNowiki() replaced with placeholders.
	 * @internal
	 *
	 * @param array $holders Map from placeholder to original text
	 * @param string $text
	 * @return string
	 */
	public function unarmourNowiki( array $holders, string $text ): string {
		return strtr( $text, $holders );
	}
}