1 //####COPYRIGHTBEGIN####
3 // ----------------------------------------------------------------------------
4 // Copyright (C) 1998, 1999, 2000 Red Hat, Inc.
6 // This program is part of the eCos host tools.
8 // This program is free software; you can redistribute it and/or modify it
9 // under the terms of the GNU General Public License as published by the Free
10 // Software Foundation; either version 2 of the License, or (at your option)
13 // This program is distributed in the hope that it will be useful, but WITHOUT
14 // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
18 // You should have received a copy of the GNU General Public License along with
19 // this program; if not, write to the Free Software Foundation, Inc.,
20 // 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
22 // ----------------------------------------------------------------------------
24 //####COPYRIGHTEND####
27 //===========================================================================
28 //#####DESCRIPTIONBEGIN####
31 // Contact(s): julians
35 // Description: HTML parser/HTML Help file generator
42 //####DESCRIPTIONEND####
44 //===========================================================================
46 // ============================================================================
48 // ============================================================================
50 // ----------------------------------------------------------------------------
52 // ----------------------------------------------------------------------------
54 #pragma implementation "htmlparser.h"
57 // Includes other headers for precompiled compilation
64 #include "wx/textfile.h"
65 #include "wx/wfstream.h"
68 #include "htmlparser.h"
71 * wxSimpleHtmlParser
72 * Simple HTML parser
// Default constructor: creates a parser object; takes no arguments.
75 wxSimpleHtmlParser::wxSimpleHtmlParser()
// Destructor.
// NOTE(review): ParseString() allocates m_topLevel with new; presumably it
// is released here (e.g. via Clear()) - confirm.
82 wxSimpleHtmlParser::~wxSimpleHtmlParser()
// Parses the HTML file named by `filename`.
// The file is opened through textFile.Open(), its lines are walked, and the
// function returns whatever ParseString(text) returns.
// NOTE(review): presumably the lines are concatenated into `text` with a
// separator between them (see the `i != count - 1` test) - confirm.
87 bool wxSimpleHtmlParser::ParseFile(const wxString& filename)
91 if (textFile.Open(filename))
// Index-based walk over the lines of the file.
96 int count = textFile.GetLineCount();
97 for (i = 0; i < count; i++)
100 line = textFile.GetFirstLine();
102 line = textFile.GetNextLine();
// True for every line except the last one.
105 if (i != (count - 1))
// NOTE(review): a second, Eof()-driven walk over the same file; presumably
// only one of the two loops is compiled in - confirm.
110 for ( line = textFile.GetFirstLine(); !textFile.Eof(); line = textFile.GetNextLine() )
118 return ParseString(text);
// Parses HTML held in the string `str`.
// Records the string length in m_length, creates a new root tag named
// "TOPLEVEL" (type wxSimpleHtmlTag_TopLevel) and returns the result of
// ParseHtml() on that root.
124 bool wxSimpleHtmlParser::ParseString(const wxString& str)
130 m_length = str.Length();
// NOTE(review): m_topLevel is overwritten with a fresh allocation; confirm
// that any previous tree has been released before this point.
132 m_topLevel = new wxSimpleHtmlTag(wxT("TOPLEVEL"), wxSimpleHtmlTag_TopLevel);
134 return ParseHtml(m_topLevel);
137 // Main recursive parsing function
// Looks at what starts at the current position and appends each parsed item
// to `parent` via AppendTag(): directives ("<!"), closing tags ("</"),
// opening tags ("<") and, failing all of those, a chunk of plain text.
// NOTE(review): the first branch of the if/else chain is not shown;
// presumably it handles comments ("<!--") ahead of the directive test,
// since a comment also starts with "<!" - confirm.
138 bool wxSimpleHtmlParser::ParseHtml(wxSimpleHtmlTag* parent)
147 else if (IsDirective())
149 wxSimpleHtmlTag* tag = ParseDirective();
151 parent->AppendTag(tag);
// "</" is tested before the generic '<' test below, because a closing tag
// also begins with '<'.
153 else if (IsTagClose())
155 wxSimpleHtmlTag* tag = ParseTagClose();
157 parent->AppendTag(tag);
159 else if (IsTagStartBracket(GetChar(m_pos)))
161 wxSimpleHtmlTag* tag = ParseTagHeader();
163 parent->AppendTag(tag);
167 // Just a text string
// Text is wrapped in a tag named "TEXT" of type wxSimpleHtmlTag_Text.
171 wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(wxT("TEXT"), wxSimpleHtmlTag_Text);
173 parent->AppendTag(tag);
179 // Plain text, up until an angled bracket
// Appends the character at m_pos to `text` for as long as the input has not
// ended and that character is not '<'.
180 bool wxSimpleHtmlParser::ParseText(wxString& text)
182 while (!Eof() && GetChar(m_pos) != wxT('<'))
184 text += GetChar(m_pos);
// Parses an opening tag such as <NAME attr=value ...>.
// After testing for '<', reads the tag name with ReadWord(), allocates a
// tag of type wxSimpleHtmlTag_Open with that name, fills in its attributes
// with ParseAttributes() and finally tests for the closing '>'.
// NOTE(review): presumably returns the newly allocated tag, which the
// caller then owns - confirm.
190 wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagHeader()
192 if (IsTagStartBracket(GetChar(m_pos)))
198 ReadWord(word, TRUE);
202 wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Open);
204 ParseAttributes(tag);
208 if (IsTagEndBracket(GetChar(m_pos)))
// Parses a closing tag such as </NAME>.
// Consumes the leading "</", reads the tag name with ReadWord() and
// allocates a tag of type wxSimpleHtmlTag_Close carrying that name.
// NOTE(review): presumably returns the newly allocated tag - confirm.
217 wxSimpleHtmlTag* wxSimpleHtmlParser::ParseTagClose()
// Second argument TRUE: advance past the token when it matches.
219 Matches(wxT("</"), TRUE);
224 ReadWord(word, TRUE);
229 wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Close);
// Reads the attributes of a tag header and stores them on `tag` through
// AppendAttribute(). Three forms are recognised: a bare quoted string, a
// bare number, and a name optionally followed by "=value".
233 bool wxSimpleHtmlParser::ParseAttributes(wxSimpleHtmlTag* tag)
235 // Parse attributes of a tag header until we reach >
236 while (!IsTagEndBracket(GetChar(m_pos)) && !Eof())
240 wxString attrName, attrValue;
// A bare quoted string is stored as an attribute name with an empty value.
// NOTE(review): the condition guarding this branch is not shown; presumably
// it is IsString() - confirm.
244 ReadString(attrName, TRUE);
245 tag->AppendAttribute(attrName, wxEmptyString);
// A bare number is stored the same way: name only, empty value.
247 else if (IsNumeric(GetChar(m_pos)))
249 ReadNumber(attrName, TRUE);
250 tag->AppendAttribute(attrName, wxEmptyString);
254 // Try to read an attribute name/value pair, or at least a name
256 ReadLiteral(attrName, TRUE);
// An '=' introduces the value, which may be quoted or unquoted.
259 if (GetChar(m_pos) == wxT('='))
265 ReadString(attrValue, TRUE);
266 else if (!Eof() && !IsTagEndBracket(GetChar(m_pos)))
267 ReadLiteral(attrValue, TRUE);
// Nothing is recorded when no name could be read.
269 if (!attrName.IsEmpty())
270 tag->AppendAttribute(attrName, attrValue);
276 // e.g. <!DOCTYPE ....>
// Parses a directive: consumes the leading "<!", reads the directive name
// with ReadWord(), allocates a tag of type wxSimpleHtmlTag_Directive with
// that name, parses its attributes and then tests for the closing '>'.
// NOTE(review): presumably returns the newly allocated tag - confirm.
277 wxSimpleHtmlTag* wxSimpleHtmlParser::ParseDirective()
279 Matches(wxT("<!"), TRUE);
284 ReadWord(word, TRUE);
288 wxSimpleHtmlTag* tag = new wxSimpleHtmlTag(word, wxSimpleHtmlTag_Directive);
290 ParseAttributes(tag);
294 if (IsTagEndBracket(GetChar(m_pos)))
// Skips over an HTML comment ("<!-- ... -->").
300 bool wxSimpleHtmlParser::ParseComment()
302 // Eat the comment tag start
303 Matches(wxT("<!--"), TRUE);
// Loop until end of input or until "-->" is found; because eatIt is TRUE,
// the terminator is consumed by the very call that detects it.
305 while (!Eof() && !Matches(wxT("-->"), TRUE))
// Skips whitespace starting at the parser's own position (m_pos); the loop
// runs while input remains and the current character is whitespace.
313 bool wxSimpleHtmlParser::EatWhitespace()
315 while (!Eof() && IsWhitespace(GetChar(m_pos)))
// Variant of EatWhitespace() that works on a caller-supplied position `pos`
// (passed by reference) instead of m_pos.
320 bool wxSimpleHtmlParser::EatWhitespace(int& pos)
322 while (!Eof(pos) && IsWhitespace(GetChar(pos)))
// Reads a double-quoted string into `str`, without the surrounding quotes.
// NOTE(review): `eatIt` presumably controls whether the parser position is
// advanced past what was read - confirm.
327 bool wxSimpleHtmlParser::ReadString(wxString& str, bool eatIt)
// Only proceed when the text starts with an opening quote.
330 if (GetChar(pos) == (int) '"')
// Collect characters up to the next quote or the end of input.
333 while (!Eof(pos) && GetChar(pos) != (int) '"')
335 // TODO: how are quotes escaped in HTML?
336 str += (wxChar) GetChar(pos);
// Test for the closing quote (the loop may also have stopped at end of input).
339 if (GetChar(pos) == (int) '"')
// Reads a word into `str`: one alphabetic character followed by any number
// of word characters (see IsWordChar()).
// NOTE(review): `eatIt` presumably controls whether the parser position is
// advanced past the word - confirm.
349 bool wxSimpleHtmlParser::ReadWord(wxString& str, bool eatIt)
// A word must begin with an alphabetic character.
353 if (!IsAlpha(GetChar(pos)))
356 str += (wxChar) GetChar(pos) ;
359 while (!Eof(pos) && IsWordChar(GetChar(pos)))
361 str += (wxChar) GetChar(pos);
// Reads a run of "numeric" characters into `str`. IsNumeric() accepts
// digits as well as '-' and '.', so signs and decimal points are included.
// NOTE(review): `eatIt` presumably controls whether the parser position is
// advanced past the number - confirm.
369 bool wxSimpleHtmlParser::ReadNumber(wxString& str, bool eatIt)
// The first character must itself be numeric.
373 if (!IsNumeric(GetChar(pos)))
376 str += (wxChar) GetChar(pos) ;
379 while (!Eof(pos) && IsNumeric(GetChar(pos)))
381 str += (wxChar) GetChar(pos);
389 // Could be number, string, whatever, but read up until whitespace or end of tag (but not a quoted string)
// The scan also stops at '=', so that an attribute name is separated from
// its value.
390 bool wxSimpleHtmlParser::ReadLiteral(wxString& str, bool eatIt)
394 while (!Eof(pos) && !IsWhitespace(GetChar(pos)) && !IsTagEndBracket(GetChar(pos)) && GetChar(pos) != wxT('='))
// True when the text at the current position starts a closing tag ("</").
// Matches() is called without the eatIt argument here.
404 bool wxSimpleHtmlParser::IsTagClose()
406 return Matches(wxT("</"));
// True when the text at the current position starts a comment ("<!--").
409 bool wxSimpleHtmlParser::IsComment()
411 return Matches(wxT("<!--"));
// True when the text at the current position starts with "<!".
// Note that a comment ("<!--") also satisfies this test, so callers that
// need to tell the two apart must check IsComment() first.
414 bool wxSimpleHtmlParser::IsDirective()
416 return Matches(wxT("<!"));
// True when the character at the current position is a double quote.
419 bool wxSimpleHtmlParser::IsString()
421 return (GetChar(m_pos) == (int) '"') ;
// True when the character at the current position is alphabetic, i.e. a
// word could start here.
424 bool wxSimpleHtmlParser::IsWord()
426 return (IsAlpha(GetChar(m_pos)));
// True when `ch` is the tag-opening bracket '<'.
429 bool wxSimpleHtmlParser::IsTagStartBracket(int ch)
431 return (ch == wxT('<'));
// True when `ch` is the tag-closing bracket '>'.
434 bool wxSimpleHtmlParser::IsTagEndBracket(int ch)
436 return (ch == wxT('>'));
// True when `ch` is carriage return (13), line feed (10), space (32) or tab.
439 bool wxSimpleHtmlParser::IsWhitespace(int ch)
441 return ((ch == 13) || (ch == 10) || (ch == 32) || (ch == (int) '\t')) ;
// True when wxIsalpha() classifies `ch` (converted to wxChar) as alphabetic.
444 bool wxSimpleHtmlParser::IsAlpha(int ch)
446 return (wxIsalpha((wxChar) ch) != 0);
// True when `ch` may appear inside a word: an alphabetic character, '-',
// '_', or anything IsNumeric() accepts (digits, and therefore also '.').
449 bool wxSimpleHtmlParser::IsWordChar(int ch)
451 return (wxIsalpha((wxChar) ch) != 0 || ch == wxT('-') || ch == wxT('_') || IsNumeric(ch));
// True when `ch` is a digit, a minus sign or a decimal point. This tests a
// single character only; it does not validate a whole number.
454 bool wxSimpleHtmlParser::IsNumeric(int ch)
456 return (wxIsdigit((wxChar) ch) != 0 || ch == wxT('-') || ch == wxT('.')) ;
459 // Matches this string (case insensitive)
// Compares `tok` against the tok.Length() characters of m_text that start
// at m_pos. When they match and `eatIt` is set, m_pos is moved past the token.
// NOTE(review): presumably returns `success` - confirm.
460 bool wxSimpleHtmlParser::Matches(const wxString& tok, bool eatIt)
462 wxString text(m_text.Mid(m_pos, tok.Length()));
463 bool success = (text.CmpNoCase(tok) == 0) ;
464 if (success && eatIt)
466 m_pos += tok.Length();
471 // Safe way of getting a character
// Takes an index `i` and returns the character as an int.
// NOTE(review): presumably an out-of-range index yields a sentinel value
// rather than reading past the end of the text - confirm.
472 int wxSimpleHtmlParser::GetChar(size_t i) const
// Resets the parser; the stored text m_text is set to the empty string.
// NOTE(review): presumably this also frees the tag tree and resets the
// position/length members - confirm.
479 void wxSimpleHtmlParser::Clear()
484 m_text = wxEmptyString;
// Writes the parsed document to `stream` by delegating to the root tag's
// Write().
// NOTE(review): confirm m_topLevel is checked for NULL before this call.
490 void wxSimpleHtmlParser::Write(wxOutputStream& stream)
493 m_topLevel->Write(stream);
// Writes the parsed document to the file named by `filename`, using a
// wxFileOutputStream opened on that name.
// NOTE(review): presumably the stream is then handed to Write() and the
// return value reflects whether the file could be opened - confirm.
496 bool wxSimpleHtmlParser::WriteFile(wxString& filename)
498 wxFileOutputStream fstream(filename);
510 * Representation of a tag or chunk of text
// Constructs a tag from its name (`tagName`) and its type (`tagType`, one
// of the wxSimpleHtmlTag_* values such as wxSimpleHtmlTag_Open).
513 wxSimpleHtmlTag::wxSimpleHtmlTag(const wxString& tagName, int tagType)
// Destructor.
// NOTE(review): presumably releases the tag's attributes and children
// (see ClearAttributes() / ClearChildren()) - confirm.
523 wxSimpleHtmlTag::~wxSimpleHtmlTag()
// Walks the attribute list that starts at m_attributes.
// NOTE(review): presumably each attribute is deleted and m_attributes is
// reset afterwards - confirm.
530 void wxSimpleHtmlTag::ClearAttributes()
534 wxSimpleHtmlAttribute* attr = m_attributes;
// Remember the successor before the current node is dealt with.
537 wxSimpleHtmlAttribute* next = attr->m_next;
// Searches this tag's attribute list (starting at m_attributes) for an
// attribute whose name equals `name`, ignoring case.
// NOTE(review): presumably returns the matching attribute, or NULL when
// there is none (HasAttribute() compares the result against NULL).
547 wxSimpleHtmlAttribute* wxSimpleHtmlTag::FindAttribute(const wxString& name) const
549 wxSimpleHtmlAttribute* attr = m_attributes;
552 if (attr->GetName().CmpNoCase(name) == 0)
// Allocates a new attribute from `name` and `value` and adds it to this
// tag's attribute list.
// NOTE(review): presumably the list is walked from m_attributes to its last
// node and the new attribute is linked after it - confirm.
561 void wxSimpleHtmlTag::AppendAttribute(const wxString& name, const wxString& value)
563 wxSimpleHtmlAttribute* attr = new wxSimpleHtmlAttribute(name, value);
567 wxSimpleHtmlAttribute* last = m_attributes;
// Walks the child list that starts at m_children, detaching each child
// from its successor.
// NOTE(review): presumably each child is deleted after being detached, and
// m_next is cleared first so that deleting one child does not cascade along
// the sibling chain - confirm.
577 void wxSimpleHtmlTag::ClearChildren()
581 wxSimpleHtmlTag* child = m_children;
// Save the successor before unlinking the current child.
584 wxSimpleHtmlTag* next = child->m_next;
586 child->m_next = NULL;
// Adds `tag` as a child of this tag and sets its m_parent to this tag.
// NOTE(review): presumably the child list is walked from m_children to its
// last node and `tag` is linked after it - confirm.
594 void wxSimpleHtmlTag::AppendTag(wxSimpleHtmlTag* tag)
599 wxSimpleHtmlTag* last = m_children;
604 tag->m_parent = this;
610 // Gets the text from this tag and its descendants
// One branch concatenates GetTagText() of the children, starting at
// m_children; a tag of type wxSimpleHtmlTag_Text has its own branch; the
// remaining path returns an empty string.
// NOTE(review): the condition of the first branch is not shown, and the
// Text branch presumably returns the tag's own text - confirm both.
611 wxString wxSimpleHtmlTag::GetTagText()
616 wxSimpleHtmlTag* tag = m_children;
// Recurse into the child and accumulate its text.
619 text += tag->GetTagText();
624 else if (GetType() == wxSimpleHtmlTag_Text)
627 return wxEmptyString;
// Returns the number of attributes of this tag.
// NOTE(review): presumably computed by walking the list from m_attributes,
// so each call costs time proportional to the list length - confirm.
630 int wxSimpleHtmlTag::GetAttributeCount() const
633 wxSimpleHtmlAttribute* attr = m_attributes;
// Returns the attribute at index `i`; the lookup starts at m_attributes.
// NOTE(review): presumably zero-based (Write() iterates i from 0) and NULL
// for an out-of-range index - confirm.
642 wxSimpleHtmlAttribute* wxSimpleHtmlTag::GetAttribute(int i) const
645 wxSimpleHtmlAttribute* attr = m_attributes;
// Returns the number of child tags of this tag.
// NOTE(review): presumably computed by walking the list from m_children -
// confirm.
656 int wxSimpleHtmlTag::GetChildCount() const
659 wxSimpleHtmlTag* tag = m_children;
// True when this tag has an attribute called `name` whose value equals
// `value`. The value comparison ignores case; the name lookup is done by
// FindAttribute().
668 bool wxSimpleHtmlTag::HasAttribute(const wxString& name, const wxString& value) const
670 wxSimpleHtmlAttribute* attr = FindAttribute(name);
// The `attr &&` guard keeps a missing attribute from being dereferenced.
672 return (attr && (attr->GetValue().CmpNoCase(value) == 0)) ;
// True when this tag has an attribute called `name`, whatever its value.
675 bool wxSimpleHtmlTag::HasAttribute(const wxString& name) const
677 return FindAttribute(name) != NULL ;
// Looks up the attribute called `attrName` and copies its value into the
// output parameter `value`.
// NOTE(review): presumably returns true when the attribute exists and
// false otherwise, leaving `value` untouched in that case - confirm.
680 bool wxSimpleHtmlTag::GetAttributeValue(wxString& value, const wxString& attrName)
682 wxSimpleHtmlAttribute* attr = FindAttribute(attrName);
685 value = attr->GetValue();
692 // Search forward from this tag until we find a tag with this name & attribute
// The search starts at m_next, so this tag itself is never a candidate.
// A tag matches when NameIs(tagName) holds and it has an attribute called
// `attrName`; the attribute's value is not examined.
693 wxSimpleHtmlTag* wxSimpleHtmlTag::FindTag(const wxString& tagName, const wxString& attrName)
695 wxSimpleHtmlTag* tag = m_next;
698 if (tag->NameIs(tagName) && tag->FindAttribute(attrName))
// Starting at this tag, appends the text of every wxSimpleHtmlTag_Text tag
// encountered to `text`, watching for a closing tag named `tagName`.
// NOTE(review): presumably the walk follows m_next and stops (returning
// true) at the matching close tag - confirm.
706 bool wxSimpleHtmlTag::FindTextUntilTagClose(wxString& text, const wxString& tagName)
708 wxSimpleHtmlTag* tag = this;
// Terminating condition: a close tag carrying the requested name.
711 if (tag->GetType() == wxSimpleHtmlTag_Close && tag->NameIs(tagName))
714 if (tag->GetType() == wxSimpleHtmlTag_Text)
715 text += tag->GetText();
// Returns the child tag at index `i`; the lookup starts at m_children.
// NOTE(review): presumably zero-based and NULL for an out-of-range index -
// confirm.
723 wxSimpleHtmlTag* wxSimpleHtmlTag::GetChild(int i) const
726 wxSimpleHtmlTag* tag = m_children;
// Writes this tag to `stream` in HTML syntax; the form depends on the tag
// type (text, opening tag, directive or closing tag).
// NOTE(review): the trailing m_children walk presumably writes every child
// tag in turn - confirm.
738 void wxSimpleHtmlTag::Write(wxOutputStream& stream)
742 case wxSimpleHtmlTag_Text:
// Opening tag: "<NAME", followed by the attributes when there are any.
747 case wxSimpleHtmlTag_Open:
749 stream << "<" << m_name;
750 if (GetAttributeCount() > 0)
753 for (i = 0; i < GetAttributeCount(); i++)
755 wxSimpleHtmlAttribute* attr = GetAttribute(i);
// True for every attribute but the last (presumably emits a separator).
757 if (i < GetAttributeCount() - 1)
// Directive: "<!NAME " followed by its attributes.
763 case wxSimpleHtmlTag_Directive:
765 stream << "<!" << m_name << " ";
767 for (i = 0; i < GetAttributeCount(); i++)
769 wxSimpleHtmlAttribute* attr = GetAttribute(i);
771 if (i < GetAttributeCount() - 1)
// Closing tag: "</NAME>" followed by a newline.
777 case wxSimpleHtmlTag_Close:
779 stream << "</" << m_name << ">\n";
787 wxSimpleHtmlTag* tag = m_children;
796 void wxSimpleHtmlAttribute::Write(wxOutputStream& stream)
798 if (m_value.IsEmpty())