其他
深入理解 WKWebView (渲染篇) —— DOM 树的构建
全文12003字,预计阅读时间24分钟
当客户端 App 主进程创建 WKWebView 对象时,会创建另外两个子进程:渲染进程与网络进程。主进程 WKWebView 发起请求时,先将请求转发给渲染进程,渲染进程再转发给网络进程,网络进程请求服务器。如果请求的是一个网页,网络进程会将服务器的响应数据 HTML 文件字符流吐给渲染进程。渲染进程拿到 HTML 文件字符流,首先要进行解析,将 HTML 文件字符流转换成 DOM 树,然后在 DOM 树的基础上,进行渲染操作,也就是布局、绘制。最后渲染进程通知主进程 WKWebView 创建对应的 View 展现视图。整个流程如下图所示:
// Abridged declaration of HTMLDocument; members not relevant to the article are elided with "...".
class HTMLDocument : public Document { // inherits from Document
...
WEBCORE_EXPORT int width();
WEBCORE_EXPORT int height();
...
}
// Abridged declaration of Document showing its base classes and a few node-creation members.
class Document
: public ContainerNode // Document inherits from ContainerNode, and ContainerNode inherits from Node
, public TreeScope
, public ScriptExecutionContext
, public FontSelectorClient
, public FrameDestructionObserver
, public Supplementable<Document>
, public Logger::Observer
, public CanvasObserver {
WEBCORE_EXPORT ExceptionOr<Ref<Element>> createElementForBindings(const AtomString& tagName); // creates an Element
WEBCORE_EXPORT Ref<Text> createTextNode(const String& data); // creates a text node
WEBCORE_EXPORT Ref<Comment> createComment(const String& data); // creates a comment node
WEBCORE_EXPORT Ref<Element> createElement(const QualifiedName&, bool createdByParser); // creates an Element
....
}
document.childNodes; // Returns the collection of child Nodes: the DocumentType and the HTML node, both of which inherit from Node
document.children; // Returns the collection of child Elements: only the HTML node, because DocumentType does not inherit from Element
二、DOM树构建
2.1 解码
2.1.1 解码类图
2.1.2 解码流程
// HTMLDocumentParser is a subclass of DecodedDataDocumentParser.
//
// Decodes a chunk of raw bytes with the writer's decoder and, when the
// decoder yields text, reports the data to the writer and appends the
// decoded string to the parser. Empty input and empty decode results are
// ignored.
void DecodedDataDocumentParser::appendBytes(DocumentWriter& writer, const uint8_t* data, size_t length)
{
    if (length) {
        // The actual decoding happens here.
        String decodedText = writer.decoder().decode(data, length);
        if (!decodedText.isEmpty()) {
            writer.reportDataReceived();
            append(decodedText.releaseImpl());
        }
    }
}
// Only the most important parts are kept ("..." marks elided code).
// Decodes a chunk of raw bytes into a String, first trying to detect the
// character set declared in the document head.
String TextResourceDecoder::decode(const char* data, size_t length)
{
...
// For HTML (and XML) content, look for the character set in the head first.
if ((m_contentType == HTML || m_contentType == XML) && !m_checkedForHeadCharset) // HTML and XML
if (!checkForHeadCharset(data, length, movedDataToBuffer))
return emptyString();
...
// m_encoding holds the encoding name found in the HTML file.
if (!m_codec)
m_codec = newTextCodec(m_encoding); // create the concrete codec
...
// Decode and return.
String result = m_codec->decode(m_buffer.data() + lengthOfBOM, m_buffer.size() - lengthOfBOM, false, m_contentType == XML && !m_useLenientXMLDecoding, m_sawError);
m_buffer.clear(); // drop the buffered raw, not-yet-decoded HTML bytes
return result;
}
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <!-- charset declaration -->
<title>DOM Tree</title>
<script>window.name = 'Lucy';</script>
</head>
// Only the key code is kept ("..." marks elided code).
// Buffers the incoming bytes and scans them for a <meta> charset declaration.
bool TextResourceDecoder::checkForHeadCharset(const char* data, size_t len, bool& movedDataToBuffer)
{
...
// This is not completely efficient, since the function might go
// through the HTML head several times.
size_t oldSize = m_buffer.size();
m_buffer.grow(oldSize + len);
memcpy(m_buffer.data() + oldSize, data, len); // copy the byte stream into the decoder's own buffer, m_buffer
movedDataToBuffer = true;
// Continue with checking for an HTML meta tag if we were already doing so.
if (m_charsetParser)
return checkForMetaCharset(data, len); // a meta-tag parser already exists, so continue parsing with it
....
m_charsetParser = makeUnique<HTMLMetaCharsetParser>(); // create the meta-tag parser
return checkForMetaCharset(data, len);
}
// Feeds the bytes to the meta-charset parser. Once the parser reports that
// it is done, adopts the encoding it found (tagged as coming from a meta
// tag), releases the parser and records that the head has been checked.
// Returns whether the parser finished.
bool TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
{
    const bool finishedScanning = m_charsetParser->checkForMetaCharset(data, length);
    if (finishedScanning) {
        // Record the encoding name the parser found.
        setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
        m_charsetParser = nullptr;
        m_checkedForHeadCharset = true;
    }
    return finishedScanning;
}
// Only the key code is kept.
// Tokenizes the decoded input looking for a <meta> tag that declares a
// charset. Returns true once scanning is finished (charset found, or the
// scan gave up); returns false when more input is needed.
bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length)
{
if (m_doneChecking) // flag that prevents scanning again once finished
return true;
// We still don't have an encoding, and are in the head.
// The following tags are allowed in <head>:
// SCRIPT|STYLE|META|LINK|OBJECT|TITLE|BASE
//
// We stop scanning when a tag that is not permitted in <head>
// is seen, rather when </head> is seen, because that more closely
// matches behavior in other browsers; more details in
// <http://bugs.webkit.org/show_bug.cgi?id=3590>.
//
// Additionally, we ignore things that looks like tags in <title>, <script>
// and <noscript>; see <http://bugs.webkit.org/show_bug.cgi?id=4560>,
// <http://bugs.webkit.org/show_bug.cgi?id=12165> and
// <http://bugs.webkit.org/show_bug.cgi?id=12389>.
//
// Since many sites have charset declarations after <body> or other tags
// that are disallowed in <head>, we don't bail out until we've checked at
// least bytesToCheckUnconditionally bytes of input.
constexpr int bytesToCheckUnconditionally = 1024; // If no <meta> tag carrying a charset has been found after 1024 characters, the scan also counts as finished; no charset was detected, so a default encoding is used (windows-1252 according to the article - TODO confirm)
bool ignoredSawErrorFlag;
m_input.append(m_codec->decode(data, length, false, false, ignoredSawErrorFlag)); // decode the byte stream
while (auto token = m_tokenizer.nextToken(m_input)) { // m_tokenizer tokenizes the input; finding the meta tag requires tokenizing too
bool isEnd = token->type() == HTMLToken::EndTag;
if (isEnd || token->type() == HTMLToken::StartTag) {
AtomString tagName(token->name());
if (!isEnd) {
m_tokenizer.updateStateFor(tagName);
if (tagName == metaTag && processMeta(*token)) { // found a meta tag; process it
m_doneChecking = true;
return true; // a meta tag carrying an encoding was found, so return immediately
}
}
if (tagName != scriptTag && tagName != noscriptTag
&& tagName != styleTag && tagName != linkTag
&& tagName != metaTag && tagName != objectTag
&& tagName != titleTag && tagName != baseTag
&& (isEnd || tagName != htmlTag)
&& (isEnd || tagName != headTag)) {
m_inHeadSection = false;
}
}
if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { // the scan has left the head section and at least 1024 characters have been consumed: treat the scan as finished as well
m_doneChecking = true;
return true;
}
}
return false;
}
// Collects the attributes of a <meta> token as (name, value) string pairs,
// derives the encoding from them and stores it in m_encoding.
// Returns whether the resulting encoding is valid.
bool HTMLMetaCharsetParser::processMeta(HTMLToken& token)
{
    AttributeList metaAttributes;
    // Gather every attribute carried by the meta tag.
    for (auto& tokenAttribute : token.attributes()) {
        String name = StringImpl::create8BitIfPossible(tokenAttribute.name);
        String value = StringImpl::create8BitIfPossible(tokenAttribute.value);
        metaAttributes.append(std::make_pair(name, value));
    }

    // Look for the charset setting among the attributes.
    m_encoding = encodingFromMetaAttributes(metaAttributes);
    return m_encoding.isValid();
}
// Flushes whatever the writer's decoder still holds. When the flush yields
// text, the data is reported to the writer and the decoded string is
// appended to the parser (HTMLDocumentParser stores it).
void DecodedDataDocumentParser::flush(DocumentWriter& writer)
{
    String pendingText = writer.decoder().flush();
    if (!pendingText.isEmpty()) {
        writer.reportDataReceived();
        append(pendingText.releaseImpl());
    }
}
2.1.3 解码总结
2.2 分词
2.2.1 分词类图
// Only the main information is kept ("..." marks elided members).
// HTMLToken is the unit produced by the tokenizer: a type, a name/data
// buffer and, for tags, an attribute list.
class HTMLToken {
public:
enum Type { // the kind of token
Uninitialized, // type of a freshly initialized token
DOCTYPE, // the token is a DOCTYPE tag
StartTag, // the token is a start tag
EndTag, // the token is an end tag
Comment, // the token is a comment
Character, // the token is text
EndOfFile, // the token marks the end of the file
};
struct Attribute { // storage for a single attribute
Vector<UChar, 32> name; // attribute name
Vector<UChar, 64> value; // attribute value
// Used by HTMLSourceTracker.
unsigned startOffset;
unsigned endOffset;
};
typedef Vector<Attribute, 10> AttributeList; // list of attributes
typedef Vector<UChar, 256> DataVector; // stores the token name
...
private:
Type m_type;
DataVector m_data;
// For StartTag and EndTag
bool m_selfClosing; // whether the token is a self-closing tag (the article gives <img> as an example)
AttributeList m_attributes;
Attribute* m_currentAttribute; // the attribute currently being parsed
};
2.2.2 分词流程
// Only the key code is kept ("..." marks elided code).
// Core tokenizing loop: pulls one token at a time and hands it to tree
// construction. Returns true when the loop yields and should be resumed
// later; returns false otherwise.
bool HTMLDocumentParser::pumpTokenizerLoop(SynchronousMode mode, bool parsingFragment, PumpSession& session)
{
do { // start of the tokenizing loop body
...
if (UNLIKELY(mode == AllowYield && m_parserScheduler->shouldYieldBeforeToken(session))) // leave the loop temporarily when the scheduler asks, to avoid staying in the tokenizing loop for too long
return true;
if (!parsingFragment)
m_sourceTracker.startToken(m_input.current(), m_tokenizer);
auto token = m_tokenizer.nextToken(m_input.current()); // tokenize and take out one token
if (!token)
return false; // no token was produced, so leave the loop
if (!parsingFragment)
m_sourceTracker.endToken(m_input.current(), m_tokenizer);
constructTreeFromHTMLToken(token); // build the DOM tree from the token
} while (!isStopped());
return false;
}
// Only the key code is kept ("..." marks elided code).
// Decides whether the parser should yield before processing the next token.
bool HTMLParserScheduler::shouldYieldBeforeToken(PumpSession& session)
{
...
// numberOfTokensBeforeCheckingForYield is a static variable (4096 according to the article).
// session.processedTokensOnLastCheck is the number of tokens that had been processed at the previous yield check.
// session.didSeeScript records whether a script tag was seen while tokenizing.
if (UNLIKELY(session.processedTokens > session.processedTokensOnLastCheck + numberOfTokensBeforeCheckingForYield || session.didSeeScript))
return checkForYield(session);
++session.processedTokens;
return false;
}
// Resets the session's yield bookkeeping and reports whether the time spent
// since the session started exceeds m_parserTimeLimit (500 ms by default
// according to the article), in which case the parser should yield.
bool HTMLParserScheduler::checkForYield(PumpSession& session)
{
    session.processedTokensOnLastCheck = session.processedTokens;
    session.didSeeScript = false;

    return MonotonicTime::now() - session.startTime > m_parserTimeLimit;
}
// Only the key code is kept ("..." marks elided code).
// Drives tokenizing and schedules a resume when the loop yielded.
void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
{
...
if (shouldResume) // pumpTokenizerLoop returns true when it exits by yielding
m_parserScheduler->scheduleForResume();
}
// Arms the one-shot timer that resumes parsing after a yield.
void HTMLParserScheduler::scheduleForResume()
{
ASSERT(!m_suspended);
m_continueNextChunkTimer.startOneShot(0_s); // start the timer with a 0 s delay; per the article its callback is HTMLParserScheduler::continueNextChunkTimerFired
}
// Only the key code is kept ("..." marks elided code).
// Timer callback that restarts parsing after a yield.
void HTMLParserScheduler::continueNextChunkTimerFired()
{
...
m_parser.resumeParsingAfterYield(); // resume the tokenizing process
}
// Re-enters tokenizing after a yield, keeping this parser alive for the
// duration of the call.
void HTMLDocumentParser::resumeParsingAfterYield()
{
// pumpTokenizer can cause this parser to be detached from the Document,
// but we need to ensure it isn't deleted yet.
Ref<HTMLDocumentParser> protectedThis(*this);
// We should never be here unless we can pump immediately.
// Call pumpTokenizer() directly so that ASSERTS will fire if we're wrong.
pumpTokenizer(AllowYield); // re-enter tokenizing; this function calls pumpTokenizerLoop
endIfDelayed();
}
// Only the key code is kept ("..." marks elided code).
// Runs the tokenizer state machine on the current input character.
bool HTMLTokenizer::processToken(SegmentedString& source)
{
...
if (!m_preprocessor.peek(source, isNullCharacterSkippingState(m_state))) // read the character source currently points at into m_nextInputCharacter
return haveBufferedCharacterToken();
UChar character = m_preprocessor.nextInputCharacter(); // fetch that character
// https://html.spec.whatwg.org/#tokenization
switch (m_state) { // state transitions; the article notes m_state starts as DataState
...
}
return false;
}
// Returns whether we succeeded in peeking at the next character.
// The only way we can fail to peek is if there are no more
// characters in |source| (after collapsing \r\n, etc).
ALWAYS_INLINE bool InputStreamPreprocessor::peek(SegmentedString& source, bool skipNullCharacters = false)
{
if (UNLIKELY(source.isEmpty()))
return false;
m_nextInputCharacter = source.currentCharacter(); // read the character the stream currently points at
// Every branch in this function is expensive, so we have a
// fast-reject branch for characters that don't require special
// handling. Please run the parser benchmark whenever you touch
// this function. It's very hot.
constexpr UChar specialCharacterMask = '\n' | '\r' | '\0';
if (LIKELY(m_nextInputCharacter & ~specialCharacterMask)) {
m_skipNextNewLine = false;
return true;
}
return processNextInputCharacter(source, skipNullCharacters); // slow path: skips null characters and folds "\r\n" line endings into "\n"
}
// Slow path of peek(): normalizes newlines and handles null characters.
// Returns false only when the source runs out of characters.
bool InputStreamPreprocessor::processNextInputCharacter(SegmentedString& source, bool skipNullCharacters)
{
ProcessAgain:
ASSERT(m_nextInputCharacter == source.currentCharacter());
// For a "\r\n" pair: the '\r' branch further down sets m_skipNextNewLine = true,
// so the '\n' that follows is simply skipped here.
if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
m_skipNextNewLine = false;
source.advancePastNewline(); // move past the newline character
if (source.isEmpty())
return false;
m_nextInputCharacter = source.currentCharacter();
}
// On meeting '\r' (the first half of a possible "\r\n"), report it as '\n'
// and set m_skipNextNewLine = true.
if (m_nextInputCharacter == '\r') {
m_nextInputCharacter = '\n';
m_skipNextNewLine = true;
return true;
}
m_skipNextNewLine = false;
if (m_nextInputCharacter || isAtEndOfFile(source))
return true;
// Skip null characters.
if (skipNullCharacters && !m_tokenizer.neverSkipNullCharacters()) {
source.advancePastNonNewline();
if (source.isEmpty())
return false;
m_nextInputCharacter = source.currentCharacter();
goto ProcessAgain; // jump back to the top
}
m_nextInputCharacter = replacementCharacter;
return true;
}
BEGIN_STATE(DataState) // 刚开始解析是DataState状态if (character == '&') ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInDataState);if (character == '<') {// 整个字符流一开始是'<',那么表示是一个标签的开始if (haveBufferedCharacterToken()) RETURN_IN_CURRENT_STATE(true); ADVANCE_PAST_NON_NEWLINE_TO(TagOpenState); // 跳转到TagOpenState状态,并取去下一个字符是'!" }if (character == kEndOfFileMarker)return emitEndOfFile(source); bufferCharacter(character); ADVANCE_TO(DataState);END_STATE()// ADVANCE_PAST_NON_NEWLINE_TO定义#define ADVANCE_PAST_NON_NEWLINE_TO(newState) \do { \if (!m_preprocessor.advancePastNonNewline(source, isNullCharacterSkippingState(newState))) { \ // 如果往下移动取不到下一个字符 m_state = newState; \ // 保存状态return haveBufferedCharacterToken(); \ // 返回 } \ character = m_preprocessor.nextInputCharacter(); \ // 先取出下一个字符 goto newState; \ // 跳转到指定状态 } while (false)BEGIN_STATE(TagOpenState)if (character == '!') // 满足此条件 ADVANCE_PAST_NON_NEWLINE_TO(MarkupDeclarationOpenState); // 同理,跳转到MarkupDeclarationOpenState状态,并且取出下一个字符'D'if (character == '/') ADVANCE_PAST_NON_NEWLINE_TO(EndTagOpenState);if (isASCIIAlpha(character)) { m_token.beginStartTag(convertASCIIAlphaToLower(character)); ADVANCE_PAST_NON_NEWLINE_TO(TagNameState); }if (character == '?') { parseError();// The spec consumes the current character before switching// to the bogus comment state, but it's easier to implement// if we reconsume the current character. 
RECONSUME_IN(BogusCommentState); } parseError(); bufferASCIICharacter('<'); RECONSUME_IN(DataState);END_STATE()BEGIN_STATE(MarkupDeclarationOpenState)if (character == '-') { auto result = source.advancePast("--");if (result == SegmentedString::DidMatch) { m_token.beginComment(); SWITCH_TO(CommentStartState); }if (result == SegmentedString::NotEnoughCharacters) RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken()); } else if (isASCIIAlphaCaselessEqual(character, 'd')) { // 由于character == 'D',满足此条件 auto result = source.advancePastLettersIgnoringASCIICase("doctype"); // 看解码后的字符流中是否有完整的"doctype"if (result == SegmentedString::DidMatch) SWITCH_TO(DOCTYPEState); // 如果匹配,则跳转到DOCTYPEState,同时取出当前指向的字符,由于上面source字符流已经移动了"doctype",因此此时取出的字符为'>'if (result == SegmentedString::NotEnoughCharacters) // 如果不匹配 RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken()); // 保存状态,直接返回 } else if (character == '[' && shouldAllowCDATA()) { auto result = source.advancePast("[CDATA[");if (result == SegmentedString::DidMatch) SWITCH_TO(CDATASectionState);if (result == SegmentedString::NotEnoughCharacters) RETURN_IN_CURRENT_STATE(haveBufferedCharacterToken()); } parseError(); RECONSUME_IN(BogusCommentState);END_STATE()#define SWITCH_TO(newState) \do { \if (!m_preprocessor.peek(source, isNullCharacterSkippingState(newState))) { \ m_state = newState; \return haveBufferedCharacterToken(); \ } \ character = m_preprocessor.nextInputCharacter(); \ // 取出下一个字符 goto newState; \ // 跳转到指定的state } while (false)#define RETURN_IN_CURRENT_STATE(expression) \do { \ m_state = currentState; \ // 保存当前状态return expression; \ } while (false)BEGIN_STATE(DOCTYPEState)if (isTokenizerWhitespace(character)) ADVANCE_TO(BeforeDOCTYPENameState);if (character == kEndOfFileMarker) { parseError(); m_token.beginDOCTYPE(); m_token.setForceQuirks();return emitAndReconsumeInDataState(); } parseError(); RECONSUME_IN(BeforeDOCTYPENameState);END_STATE()#define RECONSUME_IN(newState) \do { \ // 直接跳转到指定state goto newState; \ } while 
(false) BEGIN_STATE(BeforeDOCTYPENameState)if (isTokenizerWhitespace(character)) ADVANCE_TO(BeforeDOCTYPENameState);if (character == '>') { // character == '>',匹配此处,到此DOCTYPE标签匹配完毕 parseError(); m_token.beginDOCTYPE(); m_token.setForceQuirks();return emitAndResumeInDataState(source); }if (character == kEndOfFileMarker) { parseError(); m_token.beginDOCTYPE(); m_token.setForceQuirks();return emitAndReconsumeInDataState(); } m_token.beginDOCTYPE(toASCIILower(character)); ADVANCE_PAST_NON_NEWLINE_TO(DOCTYPENameState);END_STATE()inline bool HTMLTokenizer::emitAndResumeInDataState(SegmentedString& source){ saveEndTagNameIfNeeded(); m_state = DataState; // 重置状态为初始状态DataState source.advancePastNonNewline(); // 移动到下一个字符return true;}
// TagOpenState: entered after '<'; decides which kind of tag or markup follows.
BEGIN_STATE(TagOpenState)
if (character == '!')
ADVANCE_PAST_NON_NEWLINE_TO(MarkupDeclarationOpenState);
if (character == '/')
ADVANCE_PAST_NON_NEWLINE_TO(EndTagOpenState);
if (isASCIIAlpha(character)) { // in the tag-open state the current character is 'h' (article's <html> walkthrough)
m_token.beginStartTag(convertASCIIAlphaToLower(character)); // add 'h' to the token name
ADVANCE_PAST_NON_NEWLINE_TO(TagNameState); // switch to TagNameState and move to the next character, 't'
}
if (character == '?') {
parseError();
// The spec consumes the current character before switching
// to the bogus comment state, but it's easier to implement
// if we reconsume the current character.
RECONSUME_IN(BogusCommentState);
}
parseError();
bufferASCIICharacter('<');
RECONSUME_IN(DataState);
END_STATE()
// TagNameState: accumulates the tag name one character at a time.
BEGIN_STATE(TagNameState)
if (isTokenizerWhitespace(character))
ADVANCE_TO(BeforeAttributeNameState);
if (character == '/')
ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);
if (character == '>') // the character that terminates the start tag was met in this state
return emitAndResumeInDataState(source); // the current token is complete; reset the tokenizer state to DataState
if (m_options.usePreHTML5ParserQuirks && character == '<')
return emitAndReconsumeInDataState();
if (character == kEndOfFileMarker) {
parseError();
RECONSUME_IN(DataState);
}
m_token.appendToName(toASCIILower(character)); // append the current character to the token name
ADVANCE_PAST_NON_NEWLINE_TO(TagNameState); // stay in this state and move to the next character
END_STATE()
<!-- The div tag has two attributes, class and align, and both values are quoted -->
<div class="news" align="center">Hello,World!</div>
<!-- Attribute values may also be unquoted -->
<div class=news align=center>Hello,World!</div>
BEGIN_STATE(TagNameState)if (isTokenizerWhitespace(character)) // 在解析TagName时遇到空白字符,标志属性开始 ADVANCE_TO(BeforeAttributeNameState);if (character == '/') ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);if (character == '>')return emitAndResumeInDataState(source);if (m_options.usePreHTML5ParserQuirks && character == '<')return emitAndReconsumeInDataState();if (character == kEndOfFileMarker) { parseError(); RECONSUME_IN(DataState); } m_token.appendToName(toASCIILower(character)); ADVANCE_PAST_NON_NEWLINE_TO(TagNameState);END_STATE()#define ADVANCE_TO(newState) \do { \if (!m_preprocessor.advance(source, isNullCharacterSkippingState(newState))) { \ // 移动到下一个字符 m_state = newState; \return haveBufferedCharacterToken(); \ } \ character = m_preprocessor.nextInputCharacter(); \ goto newState; \ // 跳转到指定状态 } while (false)BEGIN_STATE(BeforeAttributeNameState)if (isTokenizerWhitespace(character)) // 如果标签名后有连续空格,那么就不停的跳过,在当前状态不停循环 ADVANCE_TO(BeforeAttributeNameState);if (character == '/') ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);if (character == '>')return emitAndResumeInDataState(source);if (m_options.usePreHTML5ParserQuirks && character == '<')return emitAndReconsumeInDataState();if (character == kEndOfFileMarker) { parseError(); RECONSUME_IN(DataState); }if (character == '"' || character == '\'' || character == '<' || character == '=') parseError(); m_token.beginAttribute(source.numberOfCharactersConsumed()); // Token的属性列表增加一个,用来存放新的属性名与属性值 m_token.appendToAttributeName(toASCIILower(character)); // 添加属性名 ADVANCE_PAST_NON_NEWLINE_TO(AttributeNameState); // 跳转到AttributeNameState,并且移动到下一个字符END_STATE()BEGIN_STATE(AttributeNameState)if (isTokenizerWhitespace(character)) ADVANCE_TO(AfterAttributeNameState);if (character == '/') ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);if (character == '=') ADVANCE_PAST_NON_NEWLINE_TO(BeforeAttributeValueState); // 在解析属性名的过程中如果碰到=,说明属性名结束,属性值就要开始if (character == '>')return emitAndResumeInDataState(source);if 
(m_options.usePreHTML5ParserQuirks && character == '<')return emitAndReconsumeInDataState();if (character == kEndOfFileMarker) { parseError(); RECONSUME_IN(DataState); }if (character == '"' || character == '\'' || character == '<' || character == '=') parseError(); m_token.appendToAttributeName(toASCIILower(character)); ADVANCE_PAST_NON_NEWLINE_TO(AttributeNameState);END_STATE()BEGIN_STATE(BeforeAttributeValueState)if (isTokenizerWhitespace(character)) ADVANCE_TO(BeforeAttributeValueState);if (character == '"') ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueDoubleQuotedState); // 有的属性值有引号包围,这里跳转到AttributeValueDoubleQuotedState,并移动到下一个字符if (character == '&') RECONSUME_IN(AttributeValueUnquotedState);if (character == '\'') ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueSingleQuotedState);if (character == '>') { parseError();return emitAndResumeInDataState(source); }if (character == kEndOfFileMarker) { parseError(); RECONSUME_IN(DataState); }if (character == '<' || character == '=' || character == '`') parseError(); m_token.appendToAttributeValue(character); // 有的属性值没有引号包围,添加属性值字符到Token ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueUnquotedState); // 跳转到AttributeValueUnquotedState,并移动到下一个字符END_STATE()BEGIN_STATE(AttributeValueDoubleQuotedState)if (character == '"') { // 在当前状态下如果遇到引号,说明属性值结束 m_token.endAttribute(source.numberOfCharactersConsumed()); // 结束属性解析 ADVANCE_PAST_NON_NEWLINE_TO(AfterAttributeValueQuotedState); // 跳转到AfterAttributeValueQuotedState,并移动到下一个字符 }if (character == '&') { m_additionalAllowedCharacter = '"'; ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInAttributeValueState); }if (character == kEndOfFileMarker) { parseError(); m_token.endAttribute(source.numberOfCharactersConsumed()); RECONSUME_IN(DataState); } m_token.appendToAttributeValue(character); // 将属性值字符添加到Token ADVANCE_TO(AttributeValueDoubleQuotedState); // 跳转到当前状态END_STATE()BEGIN_STATE(AfterAttributeValueQuotedState)if (isTokenizerWhitespace(character)) ADVANCE_TO(BeforeAttributeNameState); // 
属性值解析完毕,如果后面继续跟着空白字符,说明后续还有属性要解析,调回到BeforeAttributeNameStateif (character == '/') ADVANCE_PAST_NON_NEWLINE_TO(SelfClosingStartTagState);if (character == '>')return emitAndResumeInDataState(source); // 属性值解析完毕,如果遇到'>'字符,说明整个标签也要解析完毕了,此时结束当前标签解析,并且重置分词状态为DataState,并移动到下一个字符if (m_options.usePreHTML5ParserQuirks && character == '<')return emitAndReconsumeInDataState();if (character == kEndOfFileMarker) { parseError(); RECONSUME_IN(DataState); } parseError(); RECONSUME_IN(BeforeAttributeNameState);END_STATE()BEGIN_STATE(AttributeValueUnquotedState)if (isTokenizerWhitespace(character)) { // 当解析不带引号的属性值时遇到空白字符(这与带引号的属性值不一样,带引号的属性值可以包含空白字符),说明当前属性解析完毕,后面还有其他属性,跳转到BeforeAttributeNameState,并且移动到下一个字符 m_token.endAttribute(source.numberOfCharactersConsumed()); ADVANCE_TO(BeforeAttributeNameState); }if (character == '&') { m_additionalAllowedCharacter = '>'; ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInAttributeValueState); }if (character == '>') { // 解析过程中如果遇到'>'字符,说明整个标签也要解析完毕了,此时结束当前标签解析,并且重置分词状态为DataState,并移动到下一个字符 m_token.endAttribute(source.numberOfCharactersConsumed());return emitAndResumeInDataState(source); }if (character == kEndOfFileMarker) { parseError(); m_token.endAttribute(source.numberOfCharactersConsumed()); RECONSUME_IN(DataState); }if (character == '"' || character == '\'' || character == '<' || character == '=' || character == '`') parseError(); m_token.appendToAttributeValue(character); // 将遇到的属性值字符添加到Token ADVANCE_PAST_NON_NEWLINE_TO(AttributeValueUnquotedState); // 跳转到当前状态,并且移动到下一个字符END_STATE()
<!-- Plain text inside the div tag: Hello,World! -->
<div class=news align=center>Hello,World!</div>
<!-- Plain text inside the script tag: window.name = 'Lucy'; -->
<script>window.name = 'Lucy';</script>
// DataState: buffers plain-text characters until a tag or the end of file is met.
BEGIN_STATE(DataState)
if (character == '&')
ADVANCE_PAST_NON_NEWLINE_TO(CharacterReferenceInDataState);
if (character == '<') { // a tag opener met while parsing text; there are two cases
if (haveBufferedCharacterToken()) // case 1: text has been buffered, so return in the current DataState without advancing; the next tokenizing pass therefore sees '<' again
RETURN_IN_CURRENT_STATE(true);
ADVANCE_PAST_NON_NEWLINE_TO(TagOpenState); // case 2: no text is buffered, so switch to TagOpenState to parse the start tag and move to the next character
}
if (character == kEndOfFileMarker)
return emitEndOfFile(source);
bufferCharacter(character); // buffer the character
ADVANCE_TO(DataState); // loop back into DataState and move to the next character
END_STATE()
2.3 创建节点与添加节点
2.3.1 相关类图
2.3.2 创建、添加流程
// Only the key code is kept ("..." marks elided code).
// Entry point of tree construction for one token.
void HTMLTreeBuilder::constructTree(AtomHTMLToken&& token)
{
...
if (shouldProcessTokenInForeignContent(token))
processTokenInForeignContent(WTFMove(token));
else
processToken(WTFMove(token)); // the token is processed here
...
m_tree.executeQueuedTasks(); // queued HTMLConstructionSiteTask objects are executed here; some tasks are executed directly while being created, in which case this call finds the queue empty and returns immediately
// The tree builder might have been destroyed as an indirect result of executing the queued tasks.
}
void HTMLConstructionSite::executeQueuedTasks()
{
if (m_taskQueue.isEmpty()) // 队列为空,就直接返回
return;
// Copy the task queue into a local variable in case executeTask
// re-enters the parser.
TaskQueue queue = WTFMove(m_taskQueue);
for (auto& task : queue) // 这里的task就是HTMLContructionSiteTask
executeTask(task); // 执行task
// We might be detached now.
}
// Dispatches a token to the handler for its type. Every handled type except
// Character clears m_shouldSkipLeadingNewline before its handler runs; an
// Uninitialized token is a programming error.
void HTMLTreeBuilder::processToken(AtomHTMLToken&& token)
{
    const auto tokenType = token.type();

    if (tokenType == HTMLToken::Uninitialized) {
        ASSERT_NOT_REACHED();
        return;
    }

    // Plain text in the HTML: the leading-newline flag is left untouched.
    if (tokenType == HTMLToken::Character) {
        processCharacter(WTFMove(token));
        return;
    }

    m_shouldSkipLeadingNewline = false;
    switch (tokenType) {
    case HTMLToken::DOCTYPE: // the DOCTYPE tag
        processDoctypeToken(WTFMove(token));
        break;
    case HTMLToken::StartTag: // an HTML start tag
        processStartTag(WTFMove(token));
        break;
    case HTMLToken::EndTag: // an HTML end tag
        processEndTag(WTFMove(token));
        break;
    case HTMLToken::Comment: // an HTML comment
        processComment(WTFMove(token));
        break;
    case HTMLToken::EndOfFile: // end-of-file marker
        processEndOfFile(WTFMove(token));
        break;
    default:
        break;
    }
}
// Only the key code is kept ("..." marks elided code).
// Handles a DOCTYPE token.
void HTMLTreeBuilder::processDoctypeToken(AtomHTMLToken&& token)
{
ASSERT(token.type() == HTMLToken::DOCTYPE);
if (m_insertionMode == InsertionMode::Initial) { // per the article, m_insertionMode starts as InsertionMode::Initial
m_tree.insertDoctype(WTFMove(token)); // insert the DOCTYPE node
m_insertionMode = InsertionMode::BeforeHTML; // after inserting the DOCTYPE, switch to InsertionMode::BeforeHTML: insertion of the html tag comes next
return;
}
...
}
// Only the key code is kept ("..." marks elided code).
// Creates the DOCTYPE node and queues its insertion.
void HTMLConstructionSite::insertDoctype(AtomHTMLToken&& token)
{
...
// m_attachmentRoot is the Document object, the root node of the document (per the article).
// DocumentType::create creates the DOCTYPE node.
// attachLater creates an HTMLConstructionSiteTask internally.
attachLater(m_attachmentRoot, DocumentType::create(m_document, token.name(), publicId, systemId));
...
}
// Only the key code is kept ("..." marks elided code).
// Builds an Insert task for attaching child to parent and appends it to the task queue.
void HTMLConstructionSite::attachLater(ContainerNode& parent, Ref<Node>&& child, bool selfClosing)
{
...
HTMLConstructionSiteTask task(HTMLConstructionSiteTask::Insert); // create the HTMLConstructionSiteTask
task.parent = &parent; // the task holds the parent of the node being inserted
task.child = WTFMove(child); // the task holds the node to operate on
task.selfClosing = selfClosing; // whether the node is self-closing
// Add as a sibling of the parent if we have reached the maximum depth allowed.
// m_openElements is the HTMLElementStack; its role is explained later in the article. The number of
// entries on this stack is limited (at most 512 according to the article),
// so an HTML tag that nests too many levels of child tags triggers the branch below.
if (m_openElements.stackDepth() > m_maximumDOMTreeDepth && task.parent->parentNode())
task.parent = task.parent->parentNode(); // when the condition holds, the node is attached to the grandparent instead of the parent
ASSERT(task.parent);
m_taskQueue.append(WTFMove(task)); // append the task to the queue
}
// Located in HTMLConstructionSite.cpp.
// Runs the operation stored in the task. Building the DOM tree normally
// uses Insert; the remaining operations are only used by the adoption
// agency.
static inline void executeTask(HTMLConstructionSiteTask& task)
{
    switch (task.operation) {
    case HTMLConstructionSiteTask::Insert:
        executeInsertTask(task);
        break;
    case HTMLConstructionSiteTask::InsertAlreadyParsedChild:
        executeInsertAlreadyParsedChildTask(task);
        break;
    case HTMLConstructionSiteTask::Reparent:
        executeReparentTask(task);
        break;
    case HTMLConstructionSiteTask::TakeAllChildrenAndReparent:
        executeTakeAllChildrenAndReparentTask(task);
        break;
    default:
        ASSERT_NOT_REACHED();
        break;
    }
}
// Only the key code is kept ("..." marks elided code); located in HTMLConstructionSite.cpp.
// Executes an Insert task.
static inline void executeInsertTask(HTMLConstructionSiteTask& task)
{
ASSERT(task.operation == HTMLConstructionSiteTask::Insert);
insert(task); // delegate to the insert helper
...
}
// Only the key code is kept ("..." marks elided code); located in HTMLConstructionSite.cpp.
// Inserts task.child under task.parent, before task.nextChild when one is set.
static inline void insert(HTMLConstructionSiteTask& task)
{
...
ASSERT(!task.child->parentNode());
if (task.nextChild)
task.parent->parserInsertBefore(*task.child, *task.nextChild);
else
task.parent->parserAppendChild(*task.child); // continue the insertion through the parent node
}
// Only the key code is kept ("..." marks elided code).
// Parser-side append of a child node.
void ContainerNode::parserAppendChild(Node& newChild)
{
...
executeNodeInsertionWithScriptAssertion(*this, newChild, ChildChange::Source::Parser, ReplacedAllChildren::No, [&] {
if (&document() != &newChild.document())
document().adoptNode(newChild);
appendChildCommon(newChild); // the insertion continues here, inside the callback
...
});
}
// The insertion ultimately ends up here: links child in as the new last
// child of this node.
void ContainerNode::appendChildCommon(Node& child)
{
    ScriptDisallowedScope::InMainThread scriptDisallowedScope;

    child.setParentNode(this);

    if (!m_lastChild) {
        // First child ever inserted under this parent.
        m_firstChild = &child;
    } else {
        // The parent already has children: chain the new node after the current last one.
        child.setPreviousSibling(m_lastChild);
        m_lastChild->setNextSibling(&child);
    }

    // The new node is now the last child.
    m_lastChild = &child;
}
// processStartTag handles many insertion modes internally; only the key code is kept ("..." marks elided code).
void HTMLTreeBuilder::processStartTag(AtomHTMLToken&& token)
{
ASSERT(token.type() == HTMLToken::StartTag);
switch (m_insertionMode) {
case InsertionMode::Initial:
defaultForInitial();
ASSERT(m_insertionMode == InsertionMode::BeforeHTML);
FALLTHROUGH;
case InsertionMode::BeforeHTML:
if (token.name() == htmlTag) { // the html tag is handled here
m_tree.insertHTMLHtmlStartTagBeforeHTML(WTFMove(token));
m_insertionMode = InsertionMode::BeforeHead; // after inserting the html tag, switch to InsertionMode::BeforeHead: the head tag is handled next
return;
}
...
}
}
// Only the key code is kept ("..." marks elided code).
// Creates the html element, attaches it under the attachment root and pushes it onto the open-elements stack.
void HTMLConstructionSite::insertHTMLHtmlStartTagBeforeHTML(AtomHTMLToken&& token)
{
auto element = HTMLHtmlElement::create(m_document); // create the html node
setAttributes(element, token, m_parserContentPolicy);
attachLater(m_attachmentRoot, element.copyRef()); // goes through attachLater as well, like DOCTYPE
m_openElements.pushHTMLHtmlElement(HTMLStackItem::create(element.copyRef(), WTFMove(token))); // note: the html start tag being inserted is pushed onto the HTMLElementStack here
executeQueuedTasks(); // the task is executed right away during insertion, so the executeQueuedTasks call made later by HTMLTreeBuilder::constructTree returns immediately
...
}
// 只保留关健代码
void HTMLConstructionSite::insertTextNode(const String& characters, WhitespaceMode whitespaceMode)
{
HTMLConstructionSiteTask task(HTMLConstructionSiteTask::Insert);
task.parent = ¤tNode(); // 直接取HTMLElementStack m_openElements的栈顶节点,此时节点是title
...
unsigned currentPosition = 0;
unsigned lengthLimit = shouldUseLengthLimit(*task.parent) ? Text::defaultLengthLimit : std::numeric_limits<unsigned>::max(); // 限制文本节点最大包含的字符个数为65536
...
// 可以看到如果文本过长,会将分割成多个文本节点
while (currentPosition < characters.length()) {
AtomString charactersAtom = m_whitespaceCache.lookup(characters, whitespaceMode);
auto textNode = Text::createWithLengthLimit(task.parent->document(), charactersAtom.isNull() ? characters : charactersAtom.string(), currentPosition, lengthLimit);
// If we have a whole string of unbreakable characters the above could lead to an infinite loop. Exceeding the length limit is the lesser evil.
if (!textNode->length()) {
String substring = characters.substring(currentPosition);
AtomString substringAtom = m_whitespaceCache.lookup(substring, whitespaceMode);
textNode = Text::create(task.parent->document(), substringAtom.isNull() ? substring : substringAtom.string()); // 生成文本节点
}
currentPosition += textNode->length(); // 下一个文本节点包含的字符起点
ASSERT(currentPosition <= characters.length());
task.child = WTFMove(textNode);
executeTask(task); // 直接执行Task插入
}
}
// 代码内部有很多状态处理,这里只保留关健代码
void HTMLTreeBuilder::processEndTag(AtomHTMLToken&& token)
{
ASSERT(token.type() == HTMLToken::EndTag);
switch (m_insertionMode) {
...
case InsertionMode::Text: // 由于遇到title结束标签之前插入了文本,因此此时的插入模式就是InsertionMode::Text
m_tree.openElements().pop(); // 因为遇到了title结束标签,整个标签已经处理完毕,从HTMLElementStack栈中弹出栈顶元素title
m_insertionMode = m_originalInsertionMode; // 恢复之前的插入模式
break;
...
}