Markup.cpp 168 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521
  1. // Markup.cpp: implementation of the CMarkup class.
  2. //
  3. // Markup Release 11.5
  4. // Copyright (C) 2011 First Objective Software, Inc. All rights reserved
  5. // Go to www.firstobject.com for the latest CMarkup and EDOM documentation
  6. // Use in commercial applications requires written permission
  7. // This software is provided "as is", with no warranty.
  8. //
  9. #include "stdafx.h"
  10. #include <stdio.h>
  11. #include "Markup.h"
  12. #if defined(MCD_STRERROR) // C error routine
  13. #include <errno.h>
  14. #endif // C error routine
  15. #if defined (MARKUP_ICONV)
  16. #include <iconv.h>
  17. #endif
  18. #define x_ATTRIBQUOTE '\"' // can be double or single quote
  19. #if defined(MARKUP_STL) && ( defined(MARKUP_WINCONV) || (! defined(MCD_STRERROR)))
  20. #include <windows.h> // for MultiByteToWideChar, WideCharToMultiByte, FormatMessage
  21. #endif // need windows.h when STL and (not setlocale or not strerror), MFC afx.h includes it already
  22. #if defined(MARKUP_MBCS) // MBCS/double byte
  23. #pragma message( "Note: MBCS build (not UTF-8)" )
  24. // For UTF-8, remove MBCS from project settings C/C++ preprocessor definitions
  25. #if defined (MARKUP_WINCONV)
  26. #include <mbstring.h> // for VC++ _mbclen
  27. #endif // WINCONV
  28. #endif // MBCS/double byte
  29. #if defined(_DEBUG) && _MSC_VER > 1000 // VC++ DEBUG
  30. #undef THIS_FILE
  31. static char THIS_FILE[]=__FILE__;
  32. #if defined(DEBUG_NEW)
  33. #define new DEBUG_NEW
  34. #endif // DEBUG_NEW
  35. #endif // VC++ DEBUG
  36. // Disable "while ( 1 )" warning in VC++ 2002
  37. #if _MSC_VER >= 1300 // VC++ 2002 (7.0)
  38. #pragma warning(disable:4127)
  39. #endif // VC++ 2002 (7.0)
  40. //////////////////////////////////////////////////////////////////////
  41. // Internal static utility functions
  42. //
  43. void x_StrInsertReplace( MCD_STR& str, int nLeft, int nReplace, const MCD_STR& strInsert )
  44. {
  45. // Insert strInsert into str at nLeft replacing nReplace chars
  46. // Reduce reallocs on growing string by reserving string space
  47. // If realloc needed, allow for 1.5 times the new length
  48. //
  49. int nStrLength = MCD_STRLENGTH(str);
  50. int nInsLength = MCD_STRLENGTH(strInsert);
  51. int nNewLength = nInsLength + nStrLength - nReplace;
  52. int nAllocLen = MCD_STRCAPACITY(str);
  53. #if defined(MCD_STRINSERTREPLACE) // STL, replace method
  54. if ( nNewLength > nAllocLen )
  55. MCD_BLDRESERVE( str, (nNewLength + nNewLength/2 + 128) );
  56. MCD_STRINSERTREPLACE( str, nLeft, nReplace, strInsert );
  57. #else // MFC, no replace method
  58. int nBufferLen = nNewLength;
  59. if ( nNewLength > nAllocLen )
  60. nBufferLen += nBufferLen/2 + 128;
  61. MCD_CHAR* pDoc = MCD_GETBUFFER( str, nBufferLen );
  62. if ( nInsLength != nReplace && nLeft+nReplace < nStrLength )
  63. memmove( &pDoc[nLeft+nInsLength], &pDoc[nLeft+nReplace], (nStrLength-nLeft-nReplace)*sizeof(MCD_CHAR) );
  64. if ( nInsLength )
  65. memcpy( &pDoc[nLeft], strInsert, nInsLength*sizeof(MCD_CHAR) );
  66. MCD_RELEASEBUFFER( str, pDoc, nNewLength );
  67. #endif // MFC, no replace method
  68. }
  69. int x_Hash( MCD_PCSZ p, int nSize )
  70. {
  71. unsigned int n=0;
  72. while (*p)
  73. n += (unsigned int)(*p++);
  74. return n % nSize;
  75. }
  76. MCD_STR x_IntToStr( int n )
  77. {
  78. MCD_CHAR sz[25];
  79. MCD_SPRINTF(MCD_SSZ(sz),MCD_T("%d"),n);
  80. MCD_STR s=sz;
  81. return s;
  82. }
  83. int x_StrNCmp( MCD_PCSZ p1, MCD_PCSZ p2, int n, int bIgnoreCase = 0 )
  84. {
  85. // Fast string compare to determine equality
  86. if ( bIgnoreCase )
  87. {
  88. bool bNonAsciiFound = false;
  89. MCD_CHAR c1, c2;
  90. while ( n-- )
  91. {
  92. c1 = *p1++;
  93. c2 = *p2++;
  94. if ( c1 != c2 )
  95. {
  96. if ( bNonAsciiFound )
  97. return c1 - c2;
  98. if ( c1 >= 'a' && c1 <= 'z' )
  99. c1 = (MCD_CHAR)( c1 - ('a'-'A') );
  100. if ( c2 >= 'a' && c2 <= 'z' )
  101. c2 = (MCD_CHAR)( c2 - ('a'-'A') );
  102. if ( c1 != c2 )
  103. return c1 - c2;
  104. }
  105. else if ( (unsigned int)c1 > 127 )
  106. bNonAsciiFound = true;
  107. }
  108. }
  109. else
  110. {
  111. while ( n-- )
  112. {
  113. if ( *p1 != *p2 )
  114. return *p1 - *p2;
  115. p1++;
  116. p2++;
  117. }
  118. }
  119. return 0;
  120. }
  // Bit flags passed as nResultCode to x_AddResult; they select which
  // attributes are written on the result element.
  enum MarkupResultCode
  {
      MRC_COUNT    = 1 << 0, // write n as the "count" attribute
      MRC_TYPE     = 1 << 1, // write pszVal as the "type" attribute
      MRC_NUMBER   = 1 << 2, // write n as the "n" attribute
      MRC_ENCODING = 1 << 3, // write pszVal as the "encoding" attribute
      MRC_LENGTH   = 1 << 4, // write n as the "length" attribute
      MRC_MODIFY   = 1 << 5, // find the existing element instead of adding a new one
      MRC_MSG      = 1 << 6  // write pszVal as the "msg" attribute
  };
  131. void x_AddResult( MCD_STR& strResult, MCD_CSTR pszID, MCD_CSTR pszVal = NULL, int nResultCode = 0, int n = -1, int n2 = -1 )
  132. {
  133. // Call this to append an error result to strResult, discard if accumulating too large
  134. if ( MCD_STRLENGTH(strResult) < 1000 )
  135. {
  136. // Use a temporary CMarkup object but keep strResult in a string to minimize memory footprint
  137. CMarkup mResult( strResult );
  138. if ( nResultCode & MRC_MODIFY )
  139. mResult.FindElem( pszID );
  140. else
  141. mResult.AddElem( pszID, MCD_T(""), CMarkup::MNF_WITHNOLINES );
  142. if ( pszVal.pcsz )
  143. {
  144. if ( nResultCode & MRC_TYPE )
  145. mResult.SetAttrib( MCD_T("type"), pszVal );
  146. else if ( nResultCode & MRC_ENCODING )
  147. mResult.SetAttrib( MCD_T("encoding"), pszVal );
  148. else if ( nResultCode & MRC_MSG )
  149. mResult.SetAttrib( MCD_T("msg"), pszVal );
  150. else
  151. mResult.SetAttrib( MCD_T("tagname"), pszVal );
  152. }
  153. if ( nResultCode & MRC_NUMBER )
  154. mResult.SetAttrib( MCD_T("n"), n );
  155. else if ( nResultCode & MRC_COUNT )
  156. mResult.SetAttrib( MCD_T("count"), n );
  157. else if ( nResultCode & MRC_LENGTH )
  158. mResult.SetAttrib( MCD_T("length"), n );
  159. else if ( n != -1 )
  160. mResult.SetAttrib( MCD_T("offset"), n );
  161. if ( n2 != -1 )
  162. mResult.SetAttrib( MCD_T("offset2"), n2 );
  163. strResult = mResult.GetDoc();
  164. }
  165. }
  166. //////////////////////////////////////////////////////////////////////
  167. // Encoding conversion struct and methods
  168. //
  169. struct TextEncoding
  170. {
  171. TextEncoding( MCD_CSTR pszFromEncoding, const void* pFromBuffer, int nFromBufferLen )
  172. {
  173. m_strFromEncoding = pszFromEncoding;
  174. m_pFrom = pFromBuffer;
  175. m_nFromLen = nFromBufferLen;
  176. m_nFailedChars = 0;
  177. m_nToCount = 0;
  178. };
  179. int PerformConversion( void* pTo, MCD_CSTR pszToEncoding = NULL );
  180. bool FindRaggedEnd( int& nTruncBeforeBytes );
  181. #if defined(MARKUP_ICONV)
  182. static const char* IConvName( char* szEncoding, MCD_CSTR pszEncoding );
  183. int IConv( void* pTo, int nToCharSize, int nFromCharSize );
  184. #endif // ICONV
  185. #if ! defined(MARKUP_WCHAR)
  186. static bool CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding );
  187. #endif // WCHAR
  188. MCD_STR m_strToEncoding;
  189. MCD_STR m_strFromEncoding;
  190. const void* m_pFrom;
  191. int m_nFromLen;
  192. int m_nToCount;
  193. int m_nFailedChars;
  194. };
  195. // Encoding names
  196. // This is a precompiled ASCII hash table for speed and minimum memory requirement
  197. // Each entry consists of a 2 digit name length, 5 digit code page, and the encoding name
  198. // Each table slot can have multiple entries, table size 155 was chosen for even distribution
  199. //
  200. MCD_PCSZ EncodingNameTable[155] =
  201. {
  202. MCD_T("0800949ksc_5601"),MCD_T("1920932cseucpkdfmtjapanese0920003x-cp20003"),
  203. MCD_T("1250221_iso-2022-jp0228591l10920004x-cp20004"),
  204. MCD_T("0228592l20920005x-cp20005"),
  205. MCD_T("0228593l30600850ibm8501000858ccsid00858"),
  206. MCD_T("0228594l40600437ibm4370701201ucs-2be0600860ibm860"),
  207. MCD_T("0600852ibm8520501250ms-ee0600861ibm8610228599l50751932cp51932"),
  208. MCD_T("0600862ibm8620620127ibm3670700858cp008581010021x-mac-thai0920261x-cp20261"),
  209. MCD_T("0600737ibm7370500869cp-gr1057003x-iscii-be0600863ibm863"),
  210. MCD_T("0750221ms502210628591ibm8190600855ibm8550600864ibm864"),
  211. MCD_T("0600775ibm7751057002x-iscii-de0300949uhc0228605l91028591iso-ir-1000600865ibm865"),
  212. MCD_T("1028594iso-ir-1101028592iso-ir-1010600866ibm8660500861cp-is0600857ibm857"),
  213. MCD_T("0950227x-cp50227"),
  214. MCD_T("0320866koi1628598csisolatinhebrew1057008x-iscii-ka"),
  215. MCD_T("1000950big5-hkscs1220106x-ia5-german0600869ibm869"),
  216. MCD_T("1057009x-iscii-ma0701200ucs-2le0712001utf32be0920269x-cp20269"),
  217. MCD_T("0800708asmo-7080500437cspc81765000unicode-1-1-utf-70612000utf-320920936x-cp20936"),
  218. MCD_T("1200775ebcdic-cp-be0628598hebrew0701201utf16be1765001unicode-1-1-utf-81765001unicode-2-0-utf-80551932x-euc"),
  219. MCD_T("1028595iso-ir-1441028597iso-ir-1260728605latin-90601200utf-161057011x-iscii-pa"),
  220. MCD_T("1028596iso-ir-1271028593iso-ir-1090751932ms51932"),
  221. MCD_T("0801253ms-greek0600949korean1050225iso2022-kr1128605iso_8859-150920949x-cp20949"),
  222. MCD_T("1200775ebcdic-cp-ch1028598iso-ir-1381057006x-iscii-as1450221iso-2022-jp-ms"),
  223. MCD_T("1057004x-iscii-ta1028599iso-ir-148"),
  224. MCD_T("1000949iso-ir-1490820127us-ascii"),MCD_T(""),
  225. MCD_T("1000936gb_2312-801900850cspc850multilingual0712000utf32le"),
  226. MCD_T("1057005x-iscii-te1300949csksc560119871965000x-unicode-2-0-utf-7"),
  227. MCD_T("0701200utf16le1965001x-unicode-2-0-utf-80928591iso8859-1"),
  228. MCD_T("0928592iso8859-21420002x_chinese-eten0520866koi8r1000932x-ms-cp932"),
  229. MCD_T("1320000x-chinese-cns1138598iso8859-8-i1057010x-iscii-gu0928593iso8859-3"),
  230. MCD_T("0928594iso8859-4"),MCD_T("0928595iso8859-51150221csiso2022jp"),
  231. MCD_T("0928596iso8859-60900154csptcp154"),
  232. MCD_T("0928597iso8859-70900932shift_jis1400154cyrillic-asian"),
  233. MCD_T("0928598iso8859-81057007x-iscii-or1150225csiso2022kr"),
  234. MCD_T("0721866koi8-ru0928599iso8859-9"),MCD_T("0910000macintosh"),MCD_T(""),
  235. MCD_T(""),MCD_T(""),
  236. MCD_T("1210004x-mac-arabic0800936gb2312800628598visual1520108x-ia5-norwegian"),
  237. MCD_T(""),MCD_T("0829001x-europa"),MCD_T(""),MCD_T("1510079x-mac-icelandic"),
  238. MCD_T("0800932sjis-win1128591csisolatin1"),MCD_T("1128592csisolatin2"),
  239. MCD_T("1400949ks_c_5601-19871128593csisolatin3"),MCD_T("1128594csisolatin4"),
  240. MCD_T("0400950big51128595csisolatin51400949ks_c_5601-1989"),
  241. MCD_T("0500775cp5001565000csunicode11utf7"),MCD_T("0501361johab"),
  242. MCD_T("1100932windows-9321100437codepage437"),
  243. MCD_T("1800862cspc862latinhebrew1310081x-mac-turkish"),MCD_T(""),
  244. MCD_T("0701256ms-arab0800775csibm5000500154cp154"),
  245. MCD_T("1100936windows-9360520127ascii"),
  246. MCD_T("1528597csisolatingreek1100874windows-874"),MCD_T("0500850cp850"),
  247. MCD_T("0700720dos-7200500950cp9500500932cp9320500437cp4370500860cp8601650222_iso-2022-jp$sio"),
  248. MCD_T("0500852cp8520500861cp8610700949ksc56010812001utf-32be"),
  249. MCD_T("0528597greek0500862cp8620520127cp3670500853cp853"),
  250. MCD_T("0500737cp7371150220iso-2022-jp0801201utf-16be0500863cp863"),
  251. MCD_T("0500936cp9360528591cp8194520932extended_unix_code_packed_format_for_japanese0500855cp8550500864cp864"),
  252. MCD_T("0500775cp7750500874cp8740800860csibm8600500865cp865"),
  253. MCD_T("0500866cp8660800861csibm8611150225iso-2022-kr0500857cp8571101201unicodefffe"),
  254. MCD_T("0700862dos-8620701255ms-hebr0500858cp858"),
  255. MCD_T("1210005x-mac-hebrew0500949cp9490800863csibm863"),
  256. MCD_T("0500869cp8691600437cspc8codepage4370700874tis-6200800855csibm8550800864csibm864"),
  257. MCD_T("0800950x-x-big50420866koi80800932ms_kanji0700874dos-8740800865csibm865"),
  258. MCD_T("0800866csibm8661210003x-mac-korean0800857csibm8570812000utf-32le"),
  259. MCD_T(""),MCD_T("0500932ms9320801200utf-16le1028591iso-8859-10500154pt154"),
  260. MCD_T("1028592iso-8859-20620866koi8-r0800869csibm869"),
  261. MCD_T("1500936csiso58gb2312800828597elot_9281238598iso-8859-8-i1028593iso-8859-30820127iso-ir-6"),
  262. MCD_T("1028594iso-8859-4"),
  263. MCD_T("0800852cspcp8520500936ms9361028595iso-8859-50621866koi8-u0701252ms-ansi"),
  264. MCD_T("1028596iso-8859-60220127us2400858pc-multilingual-850+euro"),
  265. MCD_T("1028597iso-8859-71028603iso8859-13"),
  266. MCD_T("1320000x-chinese_cns1028598iso-8859-8"),
  267. MCD_T("1828595csisolatincyrillic1028605iso8859-151028599iso-8859-9"),
  268. MCD_T("0465001utf8"),MCD_T("1510017x-mac-ukrainian"),MCD_T(""),
  269. MCD_T("0828595cyrillic"),MCD_T("0900936gb2312-80"),MCD_T(""),
  270. MCD_T("0720866cskoi8r1528591iso_8859-1:1987"),MCD_T("1528592iso_8859-2:1987"),
  271. MCD_T("1354936iso-4873:1986"),MCD_T("0700932sjis-ms1528593iso_8859-3:1988"),
  272. MCD_T("1528594iso_8859-4:19880600936gb23120701251ms-cyrl"),
  273. MCD_T("1528596iso_8859-6:19871528595iso_8859-5:1988"),
  274. MCD_T("1528597iso_8859-7:1987"),
  275. MCD_T("1201250windows-12501300932shifft_jis-ms"),
  276. MCD_T("0810029x-mac-ce1201251windows-12511528598iso_8859-8:19880900949ks_c_56011110000csmacintosh"),
  277. MCD_T("0601200cp12001201252windows-1252"),
  278. MCD_T("1052936hz-gb-23121201253windows-12531400949ks_c_5601_19871528599iso_8859-9:19890601201cp1201"),
  279. MCD_T("1201254windows-1254"),MCD_T("1000936csgb2312801201255windows-1255"),
  280. MCD_T("1201256windows-12561100932windows-31j"),
  281. MCD_T("1201257windows-12570601250cp12500601133cp1133"),
  282. MCD_T("0601251cp12511201258windows-12580601125cp1125"),
  283. MCD_T("0701254ms-turk0601252cp1252"),MCD_T("0601253cp12530601361cp1361"),
  284. MCD_T("0800949ks-c56010601254cp1254"),MCD_T("0651936euc-cn0601255cp1255"),
  285. MCD_T("0601256cp1256"),MCD_T("0601257cp12570600950csbig50800858ibm00858"),
  286. MCD_T("0601258cp1258"),MCD_T("0520105x-ia5"),
  287. MCD_T("0801250x-cp12501110006x-mac-greek0738598logical"),
  288. MCD_T("0801251x-cp1251"),MCD_T(""),
  289. MCD_T("1410001x-mac-japanese1200932cswindows31j"),
  290. MCD_T("0700936chinese0720127csascii0620932euc-jp"),
  291. MCD_T("0851936x-euc-cn0501200ucs-2"),MCD_T("0628597greek8"),
  292. MCD_T("0651949euc-kr"),MCD_T(""),MCD_T("0628591latin1"),
  293. MCD_T("0628592latin21100874iso-8859-11"),
  294. MCD_T("0628593latin31420127ansi_x3.4-19681420127ansi_x3.4-19861028591iso_8859-1"),
  295. MCD_T("0628594latin41028592iso_8859-20701200unicode1128603iso-8859-13"),
  296. MCD_T("1028593iso_8859-30628599latin51410082x-mac-croatian"),
  297. MCD_T("1028594iso_8859-41128605iso-8859-150565000utf-70851932x-euc-jp"),
  298. MCD_T("1300775cspc775baltic1028595iso_8859-50565001utf-80512000utf32"),
  299. MCD_T("1028596iso_8859-61710002x-mac-chinesetrad0601252x-ansi"),
  300. MCD_T("1028597iso_8859-70628605latin90501200utf160700154ptcp1541410010x-mac-romanian"),
  301. MCD_T("0900936iso-ir-581028598iso_8859-8"),MCD_T("1028599iso_8859-9"),
  302. MCD_T("1350221iso2022-jp-ms0400932sjis"),MCD_T("0751949cseuckr"),
  303. MCD_T("1420002x-chinese-eten"),MCD_T("1410007x-mac-cyrillic"),
  304. MCD_T("1000932shifft_jis"),MCD_T("0828596ecma-114"),MCD_T(""),
  305. MCD_T("0900932shift-jis"),MCD_T("0701256cp1256 1320107x-ia5-swedish"),
  306. MCD_T("0828597ecma-118"),
  307. MCD_T("1628596csisolatinarabic1710008x-mac-chinesesimp0600932x-sjis"),MCD_T(""),
  308. MCD_T("0754936gb18030"),MCD_T("1350221windows-502210712000cp12000"),
  309. MCD_T("0628596arabic0500936cn-gb0900932sjis-open0712001cp12001"),MCD_T(""),
  310. MCD_T(""),MCD_T("0700950cn-big50920127iso646-us1001133ibm-cp1133"),MCD_T(""),
  311. MCD_T("0800936csgb23120900949ks-c-56010310000mac"),
  312. MCD_T("1001257winbaltrim0750221cp502211020127iso-ir-6us"),
  313. MCD_T("1000932csshiftjis"),MCD_T("0300936gbk0765001cp65001"),
  314. MCD_T("1620127iso_646.irv:19911351932windows-519320920001x-cp20001")
  315. };
  316. int x_GetEncodingCodePage( MCD_CSTR pszEncoding )
  317. {
  318. // redo for completeness, the iconv set, UTF-32, and uppercase
  319. // Lookup strEncoding in EncodingNameTable and return Windows code page
  320. int nCodePage = -1;
  321. int nEncLen = MCD_PSZLEN( pszEncoding );
  322. if ( ! nEncLen )
  323. nCodePage = MCD_ACP;
  324. else if ( x_StrNCmp(pszEncoding,MCD_T("UTF-32"),6) == 0 )
  325. nCodePage = MCD_UTF32;
  326. else if ( nEncLen < 100 )
  327. {
  328. MCD_CHAR szEncodingLower[100];
  329. for ( int nEncChar=0; nEncChar<nEncLen; ++nEncChar )
  330. {
  331. MCD_CHAR cEncChar = pszEncoding[nEncChar];
  332. szEncodingLower[nEncChar] = (cEncChar>='A' && cEncChar<='Z')? (MCD_CHAR)(cEncChar+('a'-'A')) : cEncChar;
  333. }
  334. szEncodingLower[nEncLen] = '\0';
  335. MCD_PCSZ pEntry = EncodingNameTable[x_Hash(szEncodingLower,sizeof(EncodingNameTable)/sizeof(MCD_PCSZ))];
  336. while ( *pEntry )
  337. {
  338. // e.g. entry: 0565001utf-8 means length 05, code page 65001, encoding name utf-8
  339. int nEntryLen = (*pEntry - '0') * 10;
  340. ++pEntry;
  341. nEntryLen += (*pEntry - '0');
  342. ++pEntry;
  343. MCD_PCSZ pCodePage = pEntry;
  344. pEntry += 5;
  345. if ( nEntryLen == nEncLen && x_StrNCmp(szEncodingLower,pEntry,nEntryLen) == 0 )
  346. {
  347. // Convert digits to integer up to code name which always starts with alpha
  348. nCodePage = MCD_PSZTOL( pCodePage, NULL, 10 );
  349. break;
  350. }
  351. pEntry += nEntryLen;
  352. }
  353. }
  354. return nCodePage;
  355. }
  356. #if ! defined(MARKUP_WCHAR)
  357. bool TextEncoding::CanConvert( MCD_CSTR pszToEncoding, MCD_CSTR pszFromEncoding )
  358. {
  359. // Return true if MB to MB conversion is possible
  360. #if defined(MARKUP_ICONV)
  361. // iconv_open should fail if either encoding not supported or one is alias for other
  362. char szTo[100], szFrom[100];
  363. iconv_t cd = iconv_open( IConvName(szTo,pszToEncoding), IConvName(szFrom,pszFromEncoding) );
  364. if ( cd == (iconv_t)-1 )
  365. return false;
  366. iconv_close(cd);
  367. #else
  368. int nToCP = x_GetEncodingCodePage( pszToEncoding );
  369. int nFromCP = x_GetEncodingCodePage( pszFromEncoding );
  370. if ( nToCP == -1 || nFromCP == -1 )
  371. return false;
  372. #if defined(MARKUP_WINCONV)
  373. if ( nToCP == MCD_ACP || nFromCP == MCD_ACP ) // either ACP ANSI?
  374. {
  375. int nACP = GetACP();
  376. if ( nToCP == MCD_ACP )
  377. nToCP = nACP;
  378. if ( nFromCP == MCD_ACP )
  379. nFromCP = nACP;
  380. }
  381. #else // no conversion API, but we can do AToUTF8 and UTF8ToA
  382. if ( nToCP != MCD_UTF8 && nFromCP != MCD_UTF8 ) // either UTF-8?
  383. return false;
  384. #endif // no conversion API
  385. if ( nToCP == nFromCP )
  386. return false;
  387. #endif // not ICONV
  388. return true;
  389. }
  390. #endif // not WCHAR
  391. #if defined(MARKUP_ICONV)
  392. const char* TextEncoding::IConvName( char* szEncoding, MCD_CSTR pszEncoding )
  393. {
  394. // Make upper case char-based name from strEncoding which consists only of characters in the ASCII range
  395. int nEncChar = 0;
  396. while ( pszEncoding[nEncChar] )
  397. {
  398. char cEncChar = (char)pszEncoding[nEncChar];
  399. szEncoding[nEncChar] = (cEncChar>='a' && cEncChar<='z')? (cEncChar-('a'-'A')) : cEncChar;
  400. ++nEncChar;
  401. }
  402. if ( nEncChar == 6 && x_StrNCmp(szEncoding,"UTF-16",6) == 0 )
  403. {
  404. szEncoding[nEncChar++] = 'B';
  405. szEncoding[nEncChar++] = 'E';
  406. }
  407. szEncoding[nEncChar] = '\0';
  408. return szEncoding;
  409. }
  410. int TextEncoding::IConv( void* pTo, int nToCharSize, int nFromCharSize )
  411. {
  412. // Converts from m_pFrom to pTo
  413. char szTo[100], szFrom[100];
  414. iconv_t cd = iconv_open( IConvName(szTo,m_strToEncoding), IConvName(szFrom,m_strFromEncoding) );
  415. int nToLenBytes = 0;
  416. if ( cd != (iconv_t)-1 )
  417. {
  418. size_t nFromLenRemaining = (size_t)m_nFromLen * nFromCharSize;
  419. size_t nToCountRemaining = (size_t)m_nToCount * nToCharSize;
  420. size_t nToCountRemainingBefore;
  421. char* pToChar = (char*)pTo;
  422. char* pFromChar = (char*)m_pFrom;
  423. char* pToTempBuffer = NULL;
  424. const size_t nTempBufferSize = 2048;
  425. size_t nResult;
  426. if ( ! pTo )
  427. {
  428. pToTempBuffer = new char[nTempBufferSize];
  429. pToChar = pToTempBuffer;
  430. nToCountRemaining = nTempBufferSize;
  431. }
  432. while ( nFromLenRemaining )
  433. {
  434. nToCountRemainingBefore = nToCountRemaining;
  435. nResult = iconv( cd, &pFromChar, &nFromLenRemaining, &pToChar, &nToCountRemaining );
  436. nToLenBytes += (int)(nToCountRemainingBefore - nToCountRemaining);
  437. if ( nResult == (size_t)-1 )
  438. {
  439. int nErrno = errno;
  440. if ( nErrno == EILSEQ )
  441. {
  442. // Bypass bad char, question mark denotes problem in source string
  443. pFromChar += nFromCharSize;
  444. nFromLenRemaining -= nFromCharSize;
  445. if ( nToCharSize == 1 )
  446. *pToChar = '?';
  447. else if ( nToCharSize == 2 )
  448. *((unsigned short*)pToChar) = (unsigned short)'?';
  449. else if ( nToCharSize == 4 )
  450. *((unsigned int*)pToChar) = (unsigned int)'?';
  451. pToChar += nToCharSize;
  452. nToCountRemaining -= nToCharSize;
  453. nToLenBytes += nToCharSize;
  454. size_t nInitFromLen = 0, nInitToCount = 0;
  455. iconv(cd, NULL, &nInitFromLen ,NULL, &nInitToCount );
  456. }
  457. else if ( nErrno == EINVAL )
  458. break; // incomplete character or shift sequence at end of input
  459. else if ( nErrno == E2BIG && !pToTempBuffer )
  460. break; // output buffer full should only happen when using a temp buffer
  461. }
  462. else
  463. m_nFailedChars += nResult;
  464. if ( pToTempBuffer && nToCountRemaining < 10 )
  465. {
  466. nToCountRemaining = nTempBufferSize;
  467. pToChar = pToTempBuffer;
  468. }
  469. }
  470. if ( pToTempBuffer )
  471. delete[] pToTempBuffer;
  472. iconv_close(cd);
  473. }
  474. return nToLenBytes / nToCharSize;
  475. }
  476. #endif
  477. #if defined(MARKUP_WINCONV)
  478. bool x_NoDefaultChar( int nCP )
  479. {
  480. // WideCharToMultiByte fails if lpUsedDefaultChar is non-NULL for these code pages:
  481. return (bool)(nCP == 65000 || nCP == 65001 || nCP == 50220 || nCP == 50221 || nCP == 50222 || nCP == 50225 ||
  482. nCP == 50227 || nCP == 50229 || nCP == 52936 || nCP == 54936 || (nCP >= 57002 && nCP <= 57011) );
  483. }
  484. #endif
  485. int TextEncoding::PerformConversion( void* pTo, MCD_CSTR pszToEncoding/*=NULL*/ )
  486. {
  487. // If pTo is not NULL, it must be large enough to hold result, length of result is returned
  488. // m_nFailedChars will be set to >0 if characters not supported in strToEncoding
  489. int nToLen = 0;
  490. if ( pszToEncoding.pcsz )
  491. m_strToEncoding = pszToEncoding;
  492. int nToCP = x_GetEncodingCodePage( m_strToEncoding );
  493. if ( nToCP == -1 )
  494. nToCP = MCD_ACP;
  495. int nFromCP = x_GetEncodingCodePage( m_strFromEncoding );
  496. if ( nFromCP == -1 )
  497. nFromCP = MCD_ACP;
  498. m_nFailedChars = 0;
  499. #if ! defined(MARKUP_WINCONV) && ! defined(MARKUP_ICONV)
  500. // Only non-Unicode encoding supported is locale charset, must call setlocale
  501. if ( nToCP != MCD_UTF8 && nToCP != MCD_UTF16 && nToCP != MCD_UTF32 )
  502. nToCP = MCD_ACP;
  503. if ( nFromCP != MCD_UTF8 && nFromCP != MCD_UTF16 && nFromCP != MCD_UTF32 )
  504. nFromCP = MCD_ACP;
  505. if ( nFromCP == MCD_ACP )
  506. {
  507. const char* pA = (const char*)m_pFrom;
  508. int nALenRemaining = m_nFromLen;
  509. int nCharLen;
  510. wchar_t wcChar;
  511. char* pU = (char*)pTo;
  512. while ( nALenRemaining )
  513. {
  514. nCharLen = mbtowc( &wcChar, pA, nALenRemaining );
  515. if ( nCharLen < 1 )
  516. {
  517. wcChar = (wchar_t)'?';
  518. nCharLen = 1;
  519. }
  520. pA += nCharLen;
  521. nALenRemaining -= nCharLen;
  522. if ( nToCP == MCD_UTF8 )
  523. CMarkup::EncodeCharUTF8( (int)wcChar, pU, nToLen );
  524. else if ( nToCP == MCD_UTF16 )
  525. CMarkup::EncodeCharUTF16( (int)wcChar, (unsigned short*)pU, nToLen );
  526. else // UTF32
  527. {
  528. if ( pU )
  529. ((unsigned int*)pU)[nToLen] = (unsigned int)wcChar;
  530. ++nToLen;
  531. }
  532. }
  533. }
  534. else if ( nToCP == MCD_ACP )
  535. {
  536. union pUnicodeUnion { const char* p8; const unsigned short* p16; const unsigned int* p32; } pU;
  537. pU.p8 = (const char*)m_pFrom;
  538. const char* pUEnd = pU.p8 + m_nFromLen;
  539. if ( nFromCP == MCD_UTF16 )
  540. pUEnd = (char*)( pU.p16 + m_nFromLen );
  541. else if ( nFromCP == MCD_UTF32 )
  542. pUEnd = (char*)( pU.p32 + m_nFromLen );
  543. int nCharLen;
  544. char* pA = (char*)pTo;
  545. char szA[8];
  546. int nUChar;
  547. while ( pU.p8 != pUEnd )
  548. {
  549. if ( nFromCP == MCD_UTF8 )
  550. nUChar = CMarkup::DecodeCharUTF8( pU.p8, pUEnd );
  551. else if ( nFromCP == MCD_UTF16 )
  552. nUChar = CMarkup::DecodeCharUTF16( pU.p16, (const unsigned short*)pUEnd );
  553. else // UTF32
  554. nUChar = *(pU.p32)++;
  555. if ( nUChar == -1 )
  556. nCharLen = -2;
  557. else if ( nUChar & ~0xffff )
  558. nCharLen = -1;
  559. else
  560. nCharLen = wctomb( pA?pA:szA, (wchar_t)nUChar );
  561. if ( nCharLen < 0 )
  562. {
  563. if ( nCharLen == -1 )
  564. ++m_nFailedChars;
  565. nCharLen = 1;
  566. if ( pA )
  567. *pA = '?';
  568. }
  569. if ( pA )
  570. pA += nCharLen;
  571. nToLen += nCharLen;
  572. }
  573. }
  574. #endif // not WINCONV and not ICONV
  575. if ( nFromCP == MCD_UTF32 )
  576. {
  577. const unsigned int* p32 = (const unsigned int*)m_pFrom;
  578. const unsigned int* p32End = p32 + m_nFromLen;
  579. if ( nToCP == MCD_UTF8 )
  580. {
  581. char* p8 = (char*)pTo;
  582. while ( p32 != p32End )
  583. CMarkup::EncodeCharUTF8( *p32++, p8, nToLen );
  584. }
  585. else if ( nToCP == MCD_UTF16 )
  586. {
  587. unsigned short* p16 = (unsigned short*)pTo;
  588. while ( p32 != p32End )
  589. CMarkup::EncodeCharUTF16( (int)*p32++, p16, nToLen );
  590. }
  591. else // to ANSI
  592. {
  593. // WINCONV not supported for 32To8, since only used for sizeof(wchar_t) == 4
  594. #if defined(MARKUP_ICONV)
  595. nToLen = IConv( pTo, 1, 4 );
  596. #endif // ICONV
  597. }
  598. }
  599. else if ( nFromCP == MCD_UTF16 )
  600. {
  601. // UTF16To8 will be deprecated since weird output buffer size sensitivity not worth implementing here
  602. const unsigned short* p16 = (const unsigned short*)m_pFrom;
  603. const unsigned short* p16End = p16 + m_nFromLen;
  604. int nUChar;
  605. if ( nToCP == MCD_UTF32 )
  606. {
  607. unsigned int* p32 = (unsigned int*)pTo;
  608. while ( p16 != p16End )
  609. {
  610. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  611. if ( nUChar == -1 )
  612. nUChar = '?';
  613. if ( p32 )
  614. p32[nToLen] = (unsigned int)nUChar;
  615. ++nToLen;
  616. }
  617. }
  618. #if defined(MARKUP_WINCONV)
  619. else // to UTF-8 or other multi-byte
  620. {
  621. nToLen = WideCharToMultiByte(nToCP,0,(const wchar_t*)m_pFrom,m_nFromLen,(char*)pTo,
  622. m_nToCount?m_nToCount+1:0,NULL,x_NoDefaultChar(nToCP)?NULL:&m_nFailedChars);
  623. }
  624. #else // not WINCONV
  625. else if ( nToCP == MCD_UTF8 )
  626. {
  627. char* p8 = (char*)pTo;
  628. while ( p16 != p16End )
  629. {
  630. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  631. if ( nUChar == -1 )
  632. nUChar = '?';
  633. CMarkup::EncodeCharUTF8( nUChar, p8, nToLen );
  634. }
  635. }
  636. else // to ANSI
  637. {
  638. #if defined(MARKUP_ICONV)
  639. nToLen = IConv( pTo, 1, 2 );
  640. #endif // ICONV
  641. }
  642. #endif // not WINCONV
  643. }
  644. else if ( nToCP == MCD_UTF16 ) // to UTF-16 from UTF-8/ANSI
  645. {
  646. #if defined(MARKUP_WINCONV)
  647. nToLen = MultiByteToWideChar(nFromCP,0,(const char*)m_pFrom,m_nFromLen,(wchar_t*)pTo,m_nToCount);
  648. #else // not WINCONV
  649. if ( nFromCP == MCD_UTF8 )
  650. {
  651. const char* p8 = (const char*)m_pFrom;
  652. const char* p8End = p8 + m_nFromLen;
  653. int nUChar;
  654. unsigned short* p16 = (unsigned short*)pTo;
  655. while ( p8 != p8End )
  656. {
  657. nUChar = CMarkup::DecodeCharUTF8( p8, p8End );
  658. if ( nUChar == -1 )
  659. nUChar = '?';
  660. if ( p16 )
  661. p16[nToLen] = (unsigned short)nUChar;
  662. ++nToLen;
  663. }
  664. }
  665. else // from ANSI
  666. {
  667. #if defined(MARKUP_ICONV)
  668. nToLen = IConv( pTo, 2, 1 );
  669. #endif // ICONV
  670. }
  671. #endif // not WINCONV
  672. }
  673. else if ( nToCP == MCD_UTF32 ) // to UTF-32 from UTF-8/ANSI
  674. {
  675. if ( nFromCP == MCD_UTF8 )
  676. {
  677. const char* p8 = (const char*)m_pFrom;
  678. const char* p8End = p8 + m_nFromLen;
  679. int nUChar;
  680. unsigned int* p32 = (unsigned int*)pTo;
  681. while ( p8 != p8End )
  682. {
  683. nUChar = CMarkup::DecodeCharUTF8( p8, p8End );
  684. if ( nUChar == -1 )
  685. nUChar = '?';
  686. if ( p32 )
  687. p32[nToLen] = (unsigned int)nUChar;
  688. ++nToLen;
  689. }
  690. }
  691. else // from ANSI
  692. {
  693. // WINCONV not supported for ATo32, since only used for sizeof(wchar_t) == 4
  694. #if defined(MARKUP_ICONV)
  695. // nToLen = IConv( pTo, 4, 1 );
  696. // Linux: had trouble getting IConv to leave the BOM off of the UTF-32 output stream
  697. // So converting via UTF-16 with native endianness
  698. unsigned short* pwszUTF16 = new unsigned short[m_nFromLen];
  699. MCD_STR strToEncoding = m_strToEncoding;
  700. m_strToEncoding = MCD_T("UTF-16BE");
  701. short nEndianTest = 1;
  702. if ( ((char*)&nEndianTest)[0] ) // Little-endian?
  703. m_strToEncoding = MCD_T("UTF-16LE");
  704. m_nToCount = m_nFromLen;
  705. int nUTF16Len = IConv( pwszUTF16, 2, 1 );
  706. m_strToEncoding = strToEncoding;
  707. const unsigned short* p16 = (const unsigned short*)pwszUTF16;
  708. const unsigned short* p16End = p16 + nUTF16Len;
  709. int nUChar;
  710. unsigned int* p32 = (unsigned int*)pTo;
  711. while ( p16 != p16End )
  712. {
  713. nUChar = CMarkup::DecodeCharUTF16( p16, p16End );
  714. if ( nUChar == -1 )
  715. nUChar = '?';
  716. if ( p32 )
  717. *p32++ = (unsigned int)nUChar;
  718. ++nToLen;
  719. }
  720. delete[] pwszUTF16;
  721. #endif // ICONV
  722. }
  723. }
  724. else
  725. {
  726. #if defined(MARKUP_ICONV)
  727. nToLen = IConv( pTo, 1, 1 );
  728. #elif defined(MARKUP_WINCONV)
  729. wchar_t* pwszUTF16 = new wchar_t[m_nFromLen];
  730. int nUTF16Len = MultiByteToWideChar(nFromCP,0,(const char*)m_pFrom,m_nFromLen,pwszUTF16,m_nFromLen);
  731. nToLen = WideCharToMultiByte(nToCP,0,pwszUTF16,nUTF16Len,(char*)pTo,m_nToCount,NULL,
  732. x_NoDefaultChar(nToCP)?NULL:&m_nFailedChars);
  733. delete[] pwszUTF16;
  734. #endif // WINCONV
  735. }
  736. // Store the length in case this is called again after allocating output buffer to fit
  737. m_nToCount = nToLen;
  738. return nToLen;
  739. }
  740. bool TextEncoding::FindRaggedEnd( int& nTruncBeforeBytes )
  741. {
  742. // Check for ragged end UTF-16 or multi-byte according to m_strToEncoding, expects at least 40 bytes to work with
  743. bool bSuccess = true;
  744. nTruncBeforeBytes = 0;
  745. int nCP = x_GetEncodingCodePage( m_strFromEncoding );
  746. if ( nCP == MCD_UTF16 )
  747. {
  748. unsigned short* pUTF16Buffer = (unsigned short*)m_pFrom;
  749. const unsigned short* pUTF16Last = &pUTF16Buffer[m_nFromLen-1];
  750. if ( CMarkup::DecodeCharUTF16(pUTF16Last,&pUTF16Buffer[m_nFromLen]) == -1 )
  751. nTruncBeforeBytes = 2;
  752. }
  753. else // UTF-8, SBCS DBCS
  754. {
  755. if ( nCP == MCD_UTF8 )
  756. {
  757. char* pUTF8Buffer = (char*)m_pFrom;
  758. char* pUTF8End = &pUTF8Buffer[m_nFromLen];
  759. int nLast = m_nFromLen - 1;
  760. const char* pUTF8Last = &pUTF8Buffer[nLast];
  761. while ( nLast > 0 && CMarkup::DecodeCharUTF8(pUTF8Last,pUTF8End) == -1 )
  762. pUTF8Last = &pUTF8Buffer[--nLast];
  763. nTruncBeforeBytes = (int)(pUTF8End - pUTF8Last);
  764. }
  765. else
  766. {
  767. // Do a conversion-based test unless we can determine it is not multi-byte
  768. // If m_strEncoding="" default code page then GetACP can tell us the code page, otherwise just do the test
  769. #if defined(MARKUP_WINCONV)
  770. if ( nCP == 0 )
  771. nCP = GetACP();
  772. #endif
  773. int nMultibyteCharsToTest = 2;
  774. switch ( nCP )
  775. {
  776. case 54936:
  777. nMultibyteCharsToTest = 4;
  778. case 932: case 51932: case 20932: case 50220: case 50221: case 50222: case 10001: // Japanese
  779. case 949: case 51949: case 50225: case 1361: case 10003: case 20949: // Korean
  780. case 874: case 20001: case 20004: case 10021: case 20003: // Taiwan
  781. case 50930: case 50939: case 50931: case 50933: case 20833: case 50935: case 50937: // EBCDIC
  782. case 936: case 51936: case 20936: case 52936: // Chinese
  783. case 950: case 50227: case 10008: case 20000: case 20002: case 10002: // Chinese
  784. nCP = 0;
  785. break;
  786. }
  787. if ( nMultibyteCharsToTest > m_nFromLen )
  788. nMultibyteCharsToTest = m_nFromLen;
  789. if ( nCP == 0 && nMultibyteCharsToTest )
  790. {
  791. /*
  792. 1. convert the piece to Unicode with MultiByteToWideChar
  793. 2. Identify at least two Unicode code point boundaries at the end of
  794. the converted piece by stepping backwards from the end and re-
  795. converting the final 2 bytes, 3 bytes, 4 bytes etc, comparing the
  796. converted end string to the end of the entire converted piece to find
  797. a valid code point boundary.
  798. 3. Upon finding a code point boundary, I still want to make sure it
  799. will convert the same separately on either side of the divide as it
  800. does together, so separately convert the first byte and the remaining
  801. bytes and see if the result together is the same as the whole end, if
  802. not try the first two bytes and the remaining bytes. etc., until I
  803. find a useable dividing point. If none found, go back to step 2 and
  804. get a longer end string to try.
  805. */
  806. m_strToEncoding = MCD_T("UTF-16");
  807. m_nToCount = m_nFromLen*2;
  808. unsigned short* pUTF16Buffer = new unsigned short[m_nToCount];
  809. int nUTF16Len = PerformConversion( (void*)pUTF16Buffer );
  810. int nOriginalByteLen = m_nFromLen;
  811. // Guaranteed to have at least MARKUP_FILEBLOCKSIZE/2 bytes to work with
  812. const int nMaxBytesToTry = 40;
  813. unsigned short wsz16End[nMaxBytesToTry*2];
  814. unsigned short wsz16EndDivided[nMaxBytesToTry*2];
  815. const char* pszOriginalBytes = (const char*)m_pFrom;
  816. int nBoundariesFound = 0;
  817. bSuccess = false;
  818. while ( nTruncBeforeBytes < nMaxBytesToTry && ! bSuccess )
  819. {
  820. ++nTruncBeforeBytes;
  821. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes];
  822. m_nFromLen = nTruncBeforeBytes;
  823. m_nToCount = nMaxBytesToTry*2;
  824. int nEndUTF16Len = PerformConversion( (void*)wsz16End );
  825. if ( nEndUTF16Len && memcmp(wsz16End,&pUTF16Buffer[nUTF16Len-nEndUTF16Len],nEndUTF16Len*2) == 0 )
  826. {
  827. ++nBoundariesFound;
  828. if ( nBoundariesFound > 2 )
  829. {
  830. int nDivideAt = 1;
  831. while ( nDivideAt < nTruncBeforeBytes )
  832. {
  833. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes];
  834. m_nFromLen = nDivideAt;
  835. m_nToCount = nMaxBytesToTry*2;
  836. int nDividedUTF16Len = PerformConversion( (void*)wsz16EndDivided );
  837. if ( nDividedUTF16Len )
  838. {
  839. m_pFrom = &pszOriginalBytes[nOriginalByteLen-nTruncBeforeBytes+nDivideAt];
  840. m_nFromLen = nTruncBeforeBytes-nDivideAt;
  841. m_nToCount = nMaxBytesToTry*2-nDividedUTF16Len;
  842. nDividedUTF16Len += PerformConversion( (void*)&wsz16EndDivided[nDividedUTF16Len] );
  843. if ( m_nToCount && nEndUTF16Len == nDividedUTF16Len && memcmp(wsz16End,wsz16EndDivided,nEndUTF16Len) == 0 )
  844. {
  845. nTruncBeforeBytes -= nDivideAt;
  846. bSuccess = true;
  847. break;
  848. }
  849. }
  850. ++nDivideAt;
  851. }
  852. }
  853. }
  854. }
  855. delete [] pUTF16Buffer;
  856. }
  857. }
  858. }
  859. return bSuccess;
  860. }
  861. bool x_EndianSwapRequired( int nDocFlags )
  862. {
  863. short nWord = 1;
  864. char cFirstByte = ((char*)&nWord)[0];
  865. if ( cFirstByte ) // LE
  866. {
  867. if ( nDocFlags & CMarkup::MDF_UTF16BEFILE )
  868. return true;
  869. }
  870. else if ( nDocFlags & CMarkup::MDF_UTF16LEFILE )
  871. return true;
  872. return false;
  873. }
  874. void x_EndianSwapUTF16( unsigned short* pBuffer, int nCharLen )
  875. {
  876. unsigned short cChar;
  877. while ( nCharLen-- )
  878. {
  879. cChar = pBuffer[nCharLen];
  880. pBuffer[nCharLen] = (unsigned short)((cChar<<8) | (cChar>>8));
  881. }
  882. }
  883. //////////////////////////////////////////////////////////////////////
  884. // Element position indexes
  885. // This is the primary means of storing the layout of the document
  886. //
  887. struct ElemPos
  888. {
  889. ElemPos() {};
  890. ElemPos( const ElemPos& pos ) { *this = pos; };
  891. int StartTagLen() const { return nStartTagLen; };
  892. void SetStartTagLen( int n ) { nStartTagLen = n; };
  893. void AdjustStartTagLen( int n ) { nStartTagLen += n; };
  894. int EndTagLen() const { return nEndTagLen; };
  895. void SetEndTagLen( int n ) { nEndTagLen = n; };
  896. bool IsEmptyElement() { return (StartTagLen()==nLength)?true:false; };
  897. int StartContent() const { return nStart + StartTagLen(); };
  898. int ContentLen() const { return nLength - StartTagLen() - EndTagLen(); };
  899. int StartAfter() const { return nStart + nLength; };
  900. int Level() const { return nFlags & 0xffff; };
  901. void SetLevel( int nLev ) { nFlags = (nFlags & ~0xffff) | nLev; };
  902. void ClearVirtualParent() { memset(this,0,sizeof(ElemPos)); };
  903. void SetEndTagLenUnparsed() { SetEndTagLen(1); };
  904. bool IsUnparsed() { return EndTagLen() == 1; };
  905. // Memory size: 8 32-bit integers == 32 bytes
  906. int nStart;
  907. int nLength;
  908. unsigned int nStartTagLen : 22; // 4MB limit for start tag
  909. unsigned int nEndTagLen : 10; // 1K limit for end tag
  910. int nFlags; // 16 bits flags, 16 bits level 65536 depth limit
  911. int iElemParent;
  912. int iElemChild; // first child
  913. int iElemNext; // next sibling
  914. int iElemPrev; // if this is first, iElemPrev points to last
  915. };
  916. enum MarkupNodeFlagsInternal2
  917. {
  918. MNF_REPLACE = 0x001000,
  919. MNF_QUOTED = 0x008000,
  920. MNF_EMPTY = 0x010000,
  921. MNF_DELETED = 0x020000,
  922. MNF_FIRST = 0x080000,
  923. MNF_PUBLIC = 0x300000,
  924. MNF_ILLFORMED = 0x800000,
  925. MNF_USER = 0xf000000
  926. };
  927. struct ElemPosTree
  928. {
  929. ElemPosTree() { Clear(); };
  930. ~ElemPosTree() { Release(); };
  931. enum { PA_SEGBITS = 16, PA_SEGMASK = 0xffff };
  932. void ReleaseElemPosTree() { Release(); Clear(); };
  933. void Release() ;
  934. // {
  935. // for (int n=0;n<SegsUsed();++n)
  936. // delete[] (char*)m_pSegs[n];
  937. // if (m_pSegs)
  938. // delete[] (char*)m_pSegs;
  939. // };
  940. void Clear() { m_nSegs=0; m_nSize=0; /*if( m_pSegs != NULL ) delete [](char*)m_pSegs; */m_pSegs=NULL; };
  941. int GetSize() const { return m_nSize; };
  942. int SegsUsed() const { return ((m_nSize-1)>>PA_SEGBITS) + 1; };
  943. ElemPos& GetRefElemPosAt(int i) const { return m_pSegs[i>>PA_SEGBITS][i&PA_SEGMASK]; };
  944. void CopyElemPosTree( ElemPosTree* pOtherTree, int n );
  945. void GrowElemPosTree( int nNewSize );
  946. private:
  947. ElemPos** m_pSegs;
  948. int m_nSize;
  949. int m_nSegs;
  950. };
  951. void ElemPosTree::Release()
  952. {
  953. for (int n=0;n<SegsUsed();++n)
  954. delete[] (char*)m_pSegs[n];
  955. if (m_pSegs)
  956. delete[] (char*)m_pSegs;
  957. };
  958. void ElemPosTree::CopyElemPosTree( ElemPosTree* pOtherTree, int n )
  959. {
  960. ReleaseElemPosTree();
  961. m_nSize = n;
  962. if ( m_nSize < 8 )
  963. m_nSize = 8;
  964. m_nSegs = SegsUsed();
  965. if ( m_nSegs )
  966. {
  967. m_pSegs = (ElemPos**)(new char[m_nSegs*sizeof(char*)]);
  968. int nSegSize = 1 << PA_SEGBITS;
  969. for ( int nSeg=0; nSeg < m_nSegs; ++nSeg )
  970. {
  971. if ( nSeg + 1 == m_nSegs )
  972. nSegSize = m_nSize - (nSeg << PA_SEGBITS);
  973. m_pSegs[nSeg] = (ElemPos*)(new char[nSegSize*sizeof(ElemPos)]);
  974. memcpy( m_pSegs[nSeg], pOtherTree->m_pSegs[nSeg], nSegSize*sizeof(ElemPos) );
  975. }
  976. }
  977. }
  978. void ElemPosTree::GrowElemPosTree( int nNewSize )
  979. {
  980. // Called by x_AllocElemPos when the document is created or the array is filled
  981. // The ElemPosTree class is implemented using segments to reduce contiguous memory requirements
  982. // It reduces reallocations (copying of memory) since this only occurs within one segment
  983. // The "Grow By" algorithm ensures there are no reallocations after 2 segments
  984. //
  985. // Grow By: new size can be at most one more complete segment
  986. int nSeg = (m_nSize?m_nSize-1:0) >> PA_SEGBITS;
  987. int nNewSeg = (nNewSize-1) >> PA_SEGBITS;
  988. if ( nNewSeg > nSeg + 1 )
  989. {
  990. nNewSeg = nSeg + 1;
  991. nNewSize = (nNewSeg+1) << PA_SEGBITS;
  992. }
  993. // Allocate array of segments
  994. if ( m_nSegs <= nNewSeg )
  995. {
  996. int nNewSegments = 4 + nNewSeg * 2;
  997. char* pNewSegments = new char[nNewSegments*sizeof(char*)];
  998. if ( SegsUsed() )
  999. memcpy( pNewSegments, m_pSegs, SegsUsed()*sizeof(char*) );
  1000. if ( m_pSegs )
  1001. delete[] (char*)m_pSegs;
  1002. m_pSegs = (ElemPos**)pNewSegments;
  1003. m_nSegs = nNewSegments;
  1004. }
  1005. // Calculate segment sizes
  1006. int nSegSize = m_nSize - (nSeg << PA_SEGBITS);
  1007. int nNewSegSize = nNewSize - (nNewSeg << PA_SEGBITS);
  1008. // Complete first segment
  1009. int nFullSegSize = 1 << PA_SEGBITS;
  1010. if ( nSeg < nNewSeg && nSegSize < nFullSegSize )
  1011. {
  1012. char* pNewFirstSeg = new char[ nFullSegSize * sizeof(ElemPos) ];
  1013. if ( nSegSize )
  1014. {
  1015. // Reallocate
  1016. memcpy( pNewFirstSeg, m_pSegs[nSeg], nSegSize * sizeof(ElemPos) );
  1017. delete[] (char*)m_pSegs[nSeg];
  1018. }
  1019. m_pSegs[nSeg] = (ElemPos*)pNewFirstSeg;
  1020. }
  1021. // New segment
  1022. char* pNewSeg = new char[ nNewSegSize * sizeof(ElemPos) ];
  1023. if ( nNewSeg == nSeg && nSegSize )
  1024. {
  1025. // Reallocate
  1026. memcpy( pNewSeg, m_pSegs[nSeg], nSegSize * sizeof(ElemPos) );
  1027. delete[] (char*)m_pSegs[nSeg];
  1028. }
  1029. m_pSegs[nNewSeg] = (ElemPos*)pNewSeg;
  1030. m_nSize = nNewSize;
  1031. }
// Shorthand for the ElemPos at index i in the element position tree
// Expects a pointer named m_pElemPosTree to be in scope where the macro is used
#define ELEM(i) m_pElemPosTree->GetRefElemPosAt(i)
  1033. //////////////////////////////////////////////////////////////////////
  1034. // NodePos stores information about an element or node during document creation and parsing
  1035. //
  1036. struct NodePos
  1037. {
  1038. NodePos() {};
  1039. NodePos( int n ) { nNodeFlags=n; nNodeType=0; nStart=0; nLength=0; };
  1040. int nNodeType;
  1041. int nStart;
  1042. int nLength;
  1043. int nNodeFlags;
  1044. MCD_STR strMeta;
  1045. };
  1046. //////////////////////////////////////////////////////////////////////
  1047. // "Is Char" defines
  1048. // Quickly determine if a character matches a limited set
  1049. //
  // x_ISONEOF(c,f,l,s) looks character c up in table s, which holds one entry for each
  // character code from f through l: the result is (int)s[c-f] when f <= c <= l, else 0.
  // Every parameter is parenthesized so that an expression argument (for example
  // b ? '\t' : 'x') keeps its meaning inside the comparisons and the subscript.
  // Note that c is evaluated more than once: do not pass an expression with side effects.
  #define x_ISONEOF(c,f,l,s) (((c)>=(f)&&(c)<=(l))?(int)((s)[(c)-(f)]):0)
  // Each macro below yields a non-zero table value when c belongs to the named set, 0 otherwise
  // classic whitespace " \t\n\r"
  #define x_ISWHITESPACE(c) x_ISONEOF(c,9,32,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // end of word in a path " =/[]"
  #define x_ISENDPATHWORD(c) x_ISONEOF(c,32,93,"\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\3\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\4\0\5")
  // end of a name " \t\n\r/>"
  #define x_ISENDNAME(c) x_ISONEOF(c,9,62,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // a small set of chars cannot be second last in attribute value " \t\n\r\"\'"
  #define x_ISNOTSECONDLASTINVAL(c) x_ISONEOF(c,9,39,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\5\0\0\0\0\1")
  // first char of doc type tag name "EAN"
  #define x_ISDOCTYPESTART(c) x_ISONEOF(c,65,78,"\2\0\0\0\1\0\0\0\0\0\0\0\0\3")
  // attrib special char "<&>\"\'"
  #define x_ISATTRIBSPECIAL(c) x_ISONEOF(c,34,62,"\4\0\0\0\2\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\3")
  // parsed text special char "<&>"
  #define x_ISSPECIAL(c) x_ISONEOF(c,38,62,"\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\3")
  // end of any name " \t\n\r<>=\\/?!\"';"
  #define x_ISENDANYNAME(c) x_ISONEOF(c,9,92,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\1\1\0\0\0\0\1\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\1\5\1\1\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1")
  // end of unquoted attrib value " \t\n\r>"
  #define x_ISENDUNQUOTED(c) x_ISONEOF(c,9,62,"\2\3\0\0\4\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\5")
  // end of attrib name "= \t\n\r>/?"
  #define x_ISENDATTRIBNAME(c) x_ISONEOF(c,9,63,"\3\4\0\0\5\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\2\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\1\1\1")
  // start of entity reference "A-Za-z#_:"
  #define x_ISSTARTENTREF(c) x_ISONEOF(c,35,122,"\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\1\0\0\0\0\0\0\1\2\3\4\5\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\1\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1")
  // within entity reference "A-Za-z0-9_:-."
  #define x_ISINENTREF(c) x_ISONEOF(c,45,122,"\1\1\0\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\0\0\1\2\3\4\5\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\0\0\0\0\1\0\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1\1")
  1075. //////////////////////////////////////////////////////////////////////
  1076. // Token struct and tokenizing functions
  1077. // TokenPos handles parsing operations on a constant text pointer
  1078. //
  1079. struct TokenPos
  1080. {
  1081. TokenPos( MCD_CSTR sz, int n, FilePos* p=NULL ) { Clear(); m_pDocText=sz; m_nTokenFlags=n; m_pReaderFilePos=p; };
  1082. void Clear() { m_nL=0; m_nR=-1; m_nNext=0; };
  1083. int Length() const { return m_nR - m_nL + 1; };
  1084. MCD_PCSZ GetTokenPtr() const { return &m_pDocText[m_nL]; };
  1085. MCD_STR GetTokenText() const { return MCD_STR( GetTokenPtr(), Length() ); };
  1086. MCD_CHAR NextChar() { m_nNext += MCD_CLEN(&m_pDocText[m_nNext]); return m_pDocText[m_nNext]; };
  1087. int WhitespaceToTag( int n ) { m_nNext = n; if (FindAny()&&m_pDocText[m_nNext]!='<') { m_nNext=n; m_nR=n-1; } return m_nNext; };
  1088. bool FindAny()
  1089. {
  1090. // Go to non-whitespace or end
  1091. MCD_CHAR cNext = m_pDocText[m_nNext];
  1092. while ( cNext && x_ISWHITESPACE(cNext) )
  1093. cNext = m_pDocText[++m_nNext];
  1094. m_nL = m_nNext;
  1095. m_nR = m_nNext-1;
  1096. return m_pDocText[m_nNext]!='\0';
  1097. };
  1098. bool FindName()
  1099. {
  1100. if ( ! FindAny() ) // go to first non-whitespace
  1101. return false;
  1102. MCD_CHAR cNext = m_pDocText[m_nNext];
  1103. while ( cNext && ! x_ISENDANYNAME(cNext) )
  1104. cNext = NextChar();
  1105. if ( m_nNext == m_nL )
  1106. ++m_nNext; // it is a special char
  1107. m_nR = m_nNext - 1;
  1108. return true;
  1109. }
  1110. bool Match( MCD_CSTR szName )
  1111. {
  1112. int nLen = Length();
  1113. return ( (x_StrNCmp( GetTokenPtr(), szName, nLen, m_nTokenFlags & CMarkup::MDF_IGNORECASE ) == 0)
  1114. && ( szName[nLen] == '\0' || x_ISENDPATHWORD(szName[nLen]) ) );
  1115. };
  1116. bool FindAttrib( MCD_PCSZ pAttrib, int n = 0, MCD_STR* pstrAttrib = NULL );
  1117. int ParseNode( NodePos& node );
  1118. int m_nL;
  1119. int m_nR;
  1120. int m_nNext;
  1121. MCD_PCSZ m_pDocText;
  1122. int m_nTokenFlags;
  1123. int m_nPreSpaceStart;
  1124. int m_nPreSpaceLength;
  1125. FilePos* m_pReaderFilePos;
  1126. };
  // Scan the tokens of a tag, starting at m_nNext, looking for an attribute either by
  // name (pAttrib) or by zero-based position (n, when pAttrib is NULL).
  //   pAttrib    - attribute name to match, or NULL to search by position
  //   n          - position of the attribute wanted when pAttrib is NULL
  //   pstrAttrib - when searching by position and non-NULL, receives the attribute name
  //                and the scan continues on to the attribute's value
  // Returns true if the attribute was found, false otherwise.
  1127. bool TokenPos::FindAttrib( MCD_PCSZ pAttrib, int n/*=0*/, MCD_STR* pstrAttrib/*=NULL*/ )
  1128. {
  1129. // Return true if found, otherwise false and token.m_nNext is new insertion point
  1130. // If pAttrib is NULL find attrib n and leave token at attrib name
  1131. // If pAttrib is given, find matching attrib and leave token at value
  1132. // support non-well-formed attributes e.g. href=/advanced_search?hl=en, nowrap
  1133. // token also holds start and length of preceding whitespace to support remove
  1134. //
  1135. int nTempPreSpaceStart;
  1136. int nTempPreSpaceLength;
  1137. MCD_CHAR cFirstChar, cNext;
  1138. int nAttrib = -1; // starts at tag name
  // nFoundAttribNameR doubles as the "found" flag: 0 means not found yet, otherwise it
  // is the right index (m_nR) of the attribute name that was found
  1139. int nFoundAttribNameR = 0;
  // true while the previous token was an unquoted '=' so the next token is a value
  1140. bool bAfterEqual = false;
  1141. while ( 1 )
  1142. {
  1143. // Starting at m_nNext, bypass whitespace and find the next token
  1144. nTempPreSpaceStart = m_nNext;
  1145. if ( ! FindAny() )
  1146. break;
  1147. nTempPreSpaceLength = m_nNext - nTempPreSpaceStart;
  1148. // Is it an opening quote?
  1149. cFirstChar = m_pDocText[m_nNext];
  1150. if ( cFirstChar == '\"' || cFirstChar == '\'' )
  1151. {
  1152. m_nTokenFlags |= MNF_QUOTED;
  1153. // Move past opening quote
  1154. ++m_nNext;
  1155. m_nL = m_nNext;
  1156. // Look for closing quote
  1157. cNext = m_pDocText[m_nNext];
  1158. while ( cNext && cNext != cFirstChar )
  1159. cNext = NextChar();
  1160. // Set right to before closing quote
  1161. m_nR = m_nNext - 1;
  1162. // Set m_nNext past closing quote unless at end of document
  1163. if ( cNext )
  1164. ++m_nNext;
  1165. }
  1166. else
  1167. {
  1168. m_nTokenFlags &= ~MNF_QUOTED;
  1169. // Go until special char or whitespace
  1170. m_nL = m_nNext;
  1171. cNext = m_pDocText[m_nNext];
  // An unquoted value (after '=') ends only at whitespace or '>'; a name also ends
  // at '=', '/' and '?'
  1172. if ( bAfterEqual )
  1173. {
  1174. while ( cNext && ! x_ISENDUNQUOTED(cNext) )
  1175. cNext = NextChar();
  1176. }
  1177. else
  1178. {
  1179. while ( cNext && ! x_ISENDATTRIBNAME(cNext) )
  1180. cNext = NextChar();
  1181. }
  1182. // Adjust end position if it is one special char
  1183. if ( m_nNext == m_nL )
  1184. ++m_nNext; // it is a special char
  1185. m_nR = m_nNext - 1;
  1186. }
  // An unquoted token that does not follow '=' is either '=', an end-of-tag
  // character, or a name (the tag name first, then attribute names)
  1187. if ( ! bAfterEqual && ! (m_nTokenFlags&MNF_QUOTED) )
  1188. {
  1189. // Is it an equal sign?
  1190. MCD_CHAR cChar = m_pDocText[m_nL];
  1191. if ( cChar == '=' )
  1192. {
  1193. bAfterEqual = true;
  // continue skips the bAfterEqual reset at the bottom of the loop
  1194. continue;
  1195. }
  1196. // Is it the end of the tag?
  1197. if ( cChar == '>' || cChar == '/' || cChar == '?' )
  1198. {
  // rewind to before the whitespace so m_nNext is the insertion point
  1199. m_nNext = nTempPreSpaceStart;
  1200. break; // attrib not found
  1201. }
  // A further name after the wanted attribute was already found means that
  // attribute had no value
  1202. if ( nFoundAttribNameR )
  1203. break;
  1204. // Attribute name
  // nAttrib is still -1 for the first name token, which is the tag name
  1205. if ( nAttrib != -1 )
  1206. {
  1207. if ( ! pAttrib )
  1208. {
  1209. if ( nAttrib == n )
  1210. {
  1211. // found by number
  1212. if ( pstrAttrib )
  1213. {
  1214. *pstrAttrib = GetTokenText();
  1215. nFoundAttribNameR = m_nR;
  1216. }
  1217. else
  // token is left at the attribute name; the pre-space members are not updated here
  1218. return true;
  1219. }
  1220. }
  1221. else if ( Match(pAttrib) )
  1222. {
  1223. // Matched attrib name, go forward to value
  1224. nFoundAttribNameR = m_nR;
  1225. }
  1226. if ( nFoundAttribNameR ) // either by n or name match
  1227. {
  // remember the whitespace that preceded the attribute name
  1228. m_nPreSpaceStart = nTempPreSpaceStart;
  1229. m_nPreSpaceLength = nTempPreSpaceLength;
  1230. }
  1231. }
  1232. ++nAttrib;
  1233. }
  // Reached for a quoted token or a token after '=': if the wanted attribute was
  // already found, stop here with bAfterEqual telling whether this token is its value
  1234. else if ( nFoundAttribNameR )
  1235. break;
  1236. bAfterEqual = false;
  1237. }
  1238. if ( nFoundAttribNameR )
  1239. {
  1240. if ( ! bAfterEqual )
  1241. {
  1242. // when attribute has no value the value is the attribute name
  1243. m_nL = m_nPreSpaceStart + m_nPreSpaceLength;
  1244. m_nR = nFoundAttribNameR;
  1245. m_nNext = nFoundAttribNameR + 1;
  1246. }
  1247. return true; // found by name
  1248. }
  1249. return false; // not found
  1250. }
  1251. //////////////////////////////////////////////////////////////////////
  1252. // Element tag stack: an array of TagPos structs to track nested elements
  1253. // This is used during parsing to match end tags with corresponding start tags
  1254. // For x_ParseElem only ElemStack::iTop is used with PushIntoLevel, PopOutOfLevel, and Current
  1255. // In file mode the full capabilities are used, tracking counts of sibling tag names for path support
  1256. //
  1257. struct TagPos
  1258. {
  1259. TagPos() { Init(); };
  1260. void SetTagName( MCD_PCSZ pName, int n ) { MCD_STRASSIGN(strTagName,pName,n); };
  1261. void Init( int i=0, int n=1 ) { nCount=1; nTagNames=n; iNext=i; iPrev=0; nSlot=-1; iSlotPrev=0; iSlotNext=0; };
  1262. void IncCount() { if (nCount) ++nCount; };
  1263. MCD_STR strTagName;
  1264. int nCount;
  1265. int nTagNames;
  1266. int iParent;
  1267. int iNext;
  1268. int iPrev;
  1269. int nSlot;
  1270. int iSlotNext;
  1271. int iSlotPrev;
  1272. };
  1273. struct ElemStack
  1274. {
  1275. enum { LS_TABLESIZE = 23 };
  1276. ElemStack() { iTop=0; iUsed=0; iPar=0; nLevel=0; nSize=0; pL=NULL; Alloc(7); pL[0].Init(); InitTable(); };
  1277. ~ElemStack() { if (pL) delete [] pL; };
  1278. TagPos& Current() { return pL[iTop]; };
  1279. void InitTable() { memset(anTable,0,sizeof(int)*LS_TABLESIZE); };
  1280. TagPos& NextParent( int& i ) { int iCur=i; i=pL[i].iParent; return pL[iCur]; };
  1281. TagPos& GetRefTagPosAt( int i ) { return pL[i]; };
  1282. void Push( MCD_PCSZ pName, int n ) { ++iUsed; if (iUsed==nSize) Alloc(nSize*2); pL[iUsed].SetTagName(pName,n); pL[iUsed].iParent=iPar; iTop=iUsed; };
  1283. void IntoLevel() { iPar = iTop; ++nLevel; };
  1284. void OutOfLevel() { if (iPar!=iTop) Pop(); iPar = pL[iTop].iParent; --nLevel; };
  1285. void PushIntoLevel( MCD_PCSZ pName, int n ) { ++iTop; if (iTop==nSize) Alloc(nSize*2); pL[iTop].SetTagName(pName,n); };
  1286. void PopOutOfLevel() { --iTop; };
  1287. void Pop() { iTop = iPar; while (iUsed && pL[iUsed].iParent==iPar) { if (pL[iUsed].nSlot!=-1) Unslot(pL[iUsed]); --iUsed; } };
  1288. void Slot( int n ) { pL[iUsed].nSlot=n; int i=anTable[n]; anTable[n]=iUsed; pL[iUsed].iSlotNext=i; if (i) pL[i].iSlotPrev=iUsed; };
  1289. void Unslot( TagPos& lp ) { int n=lp.iSlotNext,p=lp.iSlotPrev; if (n) pL[n].iSlotPrev=p; if (p) pL[p].iSlotNext=n; else anTable[lp.nSlot]=n; };
  1290. static int CalcSlot( MCD_PCSZ pName, int n, bool bIC );
  1291. void PushTagAndCount( TokenPos& token );
  1292. int iTop;
  1293. int nLevel;
  1294. int iPar;
  1295. protected:
  1296. void Alloc( int nNewSize ) { TagPos* pLNew = new TagPos[nNewSize]; Copy(pLNew); nSize=nNewSize; };
  1297. void Copy( TagPos* pLNew ) { for(int n=0;n<nSize;++n) pLNew[n]=pL[n]; if (pL) delete [] pL; pL=pLNew; };
  1298. TagPos* pL;
  1299. int iUsed;
  1300. int nSize;
  1301. int anTable[LS_TABLESIZE];
  1302. };
  1303. int ElemStack::CalcSlot( MCD_PCSZ pName, int n, bool bIC )
  1304. {
  1305. // If bIC (ASCII ignore case) then return an ASCII case insensitive hash
  1306. unsigned int nHash = 0;
  1307. MCD_PCSZ pEnd = pName + n;
  1308. while ( pName != pEnd )
  1309. {
  1310. nHash += (unsigned int)(*pName);
  1311. if ( bIC && *pName >= 'A' && *pName <= 'Z' )
  1312. nHash += ('a'-'A');
  1313. ++pName;
  1314. }
  1315. return nHash%LS_TABLESIZE;
  1316. }
  // Register the tag name held in token among the children of the current parent.
  // If a sibling entry with the same name already exists it becomes the current entry
  // and its count is incremented; otherwise a new entry is pushed (or, past 256 distinct
  // names, the current entry is re-used with counting switched off) and filed in the
  // name hash table.
  1317. void ElemStack::PushTagAndCount( TokenPos& token )
  1318. {
  1319. // Check for a matching tag name at the top level and set current if found or add new one
  1320. // Calculate hash of tag name, support ignore ASCII case for MDF_IGNORECASE
  // nSlot stays -1 until the hash has been calculated
  1321. int nSlot = -1;
  // iNext: -1 once an existing entry was matched; otherwise the index the new entry
  // links to (0 when the parent has no child entry yet)
  1322. int iNext = 0;
  1323. MCD_PCSZ pTagName = token.GetTokenPtr();
  // iTop != iPar means the current parent already has at least one child entry
  1324. if ( iTop != iPar )
  1325. {
  1326. // See if tag name is already used, first try previous sibling (almost always)
  1327. iNext = iTop;
  1328. if ( token.Match(Current().strTagName) )
  1329. {
  1330. iNext = -1;
  1331. Current().IncCount();
  1332. }
  1333. else
  1334. {
  // Walk the chain of entries filed in the same hash slot
  1335. nSlot = CalcSlot( pTagName, token.Length(), (token.m_nTokenFlags & CMarkup::MDF_IGNORECASE)?true:false );
  1336. int iLookup = anTable[nSlot];
  1337. while ( iLookup )
  1338. {
  1339. TagPos& tag = pL[iLookup];
  // the slot is shared by all levels, so the entry must also have the same parent
  1340. if ( tag.iParent == iPar && token.Match(tag.strTagName) )
  1341. {
  // unlink the matched entry from its iPrev/iNext neighbours, then link it
  // in front of the current top and make it the new top
  1342. pL[tag.iPrev].iNext = tag.iNext;
  1343. if ( tag.iNext )
  1344. pL[tag.iNext].iPrev = tag.iPrev;
  1345. tag.nTagNames = Current().nTagNames;
  1346. tag.iNext = iTop;
  1347. tag.IncCount();
  1348. iTop = iLookup;
  1349. iNext = -1;
  1350. break;
  1351. }
  1352. iLookup = tag.iSlotNext;
  1353. }
  1354. }
  1355. }
  // No existing entry matched: a new name has to be recorded
  1356. if ( iNext != -1 )
  1357. {
  1358. // Turn off in the rare case where a document uses unique tag names like record1, record2, etc, more than 256
  1359. int nTagNames = 0;
  1360. if ( iNext )
  1361. nTagNames = Current().nTagNames;
  1362. if ( nTagNames == 256 )
  1363. {
  // re-use the current entry for the new name; nCount 0 makes IncCount a no-op
  1364. MCD_STRASSIGN( (Current().strTagName), pTagName, (token.Length()) );
  1365. Current().nCount = 0;
  1366. Unslot( Current() );
  1367. }
  1368. else
  1369. {
  // Push sets the name and parent and makes the new entry current; Init then
  // resets its counters and links
  1370. Push( pTagName, token.Length() );
  1371. Current().Init( iNext, nTagNames+1 );
  1372. }
  1373. if ( nSlot == -1 )
  1374. nSlot = CalcSlot( pTagName, token.Length(), (token.m_nTokenFlags & CMarkup::MDF_IGNORECASE)?true:false );
  // NOTE(review): Slot() files pL[iUsed], while the re-use branch above modified
  // Current() (pL[iTop]) -- confirm these are always the same entry on that path
  1375. Slot( nSlot );
  1376. }
  1377. }
  1378. //////////////////////////////////////////////////////////////////////
  1379. // FilePos is created for a file while it is open
  1380. // In file mode the file stays open between CMarkup calls and is stored in m_pFilePos
  1381. //
  1382. struct FilePos
  1383. {
  1384. FilePos()
  1385. {
  1386. m_fp=NULL; m_nDocFlags=0; m_nFileByteLen=0; m_nFileByteOffset=0; m_nOpFileByteLen=0; m_nBlockSizeBasis=MARKUP_FILEBLOCKSIZE;
  1387. m_nFileCharUnitSize=0; m_nOpFileTextLen=0; m_pstrBuffer=NULL; m_nReadBufferStart=0; m_nReadBufferRemoved=0; m_nReadGatherStart=-1;
  1388. };
  1389. bool FileOpen( MCD_CSTR_FILENAME szFileName );
  1390. bool FileRead( void* pBuffer );
  1391. bool FileReadText( MCD_STR& strDoc );
  1392. bool FileCheckRaggedEnd( void* pBuffer );
  1393. bool FileReadNextBuffer();
  1394. void FileGatherStart( int nStart );
  1395. int FileGatherEnd( MCD_STR& strSubDoc );
  1396. bool FileWrite( void* pBuffer, const void* pConstBuffer = NULL );
  1397. bool FileWriteText( const MCD_STR& strDoc, int nWriteStrLen = -1 );
  1398. bool FileFlush( MCD_STR& strBuffer, int nWriteStrLen = -1, bool bFflush = false );
  1399. bool FileClose();
  1400. void FileSpecifyEncoding( MCD_STR* pstrEncoding );
  1401. bool FileAtTop();
  1402. bool FileErrorAddResult();
  1403. FILE* m_fp;
  1404. int m_nDocFlags;
  1405. int m_nOpFileByteLen;
  1406. int m_nBlockSizeBasis;
  1407. MCD_INTFILEOFFSET m_nFileByteLen;
  1408. MCD_INTFILEOFFSET m_nFileByteOffset;
  1409. int m_nFileCharUnitSize;
  1410. int m_nOpFileTextLen;
  1411. MCD_STR m_strIOResult;
  1412. MCD_STR m_strEncoding;
  1413. MCD_STR* m_pstrBuffer;
  1414. ElemStack m_elemstack;
  1415. int m_nReadBufferStart;
  1416. int m_nReadBufferRemoved;
  1417. int m_nReadGatherStart;
  1418. MCD_STR m_strReadGatherMarkup;
  1419. };
  1420. struct BomTableStruct { const char* pszBom; int nBomLen; MCD_PCSZ pszBomEnc; int nBomFlag; } BomTable[] =
  1421. {
  1422. { "\xef\xbb\xbf", 3, MCD_T("UTF-8"), CMarkup::MDF_UTF8PREAMBLE },
  1423. { "\xff\xfe", 2, MCD_T("UTF-16LE"), CMarkup::MDF_UTF16LEFILE },
  1424. { "\xfe\xff", 2, MCD_T("UTF-16BE"), CMarkup::MDF_UTF16BEFILE },
  1425. { NULL,0,NULL,0 }
  1426. };
  1427. bool FilePos::FileErrorAddResult()
  1428. {
  1429. // strerror has difficulties cross-platform
  1430. // VC++ leaves MCD_STRERROR undefined and uses FormatMessage
  1431. // Non-VC++ use strerror (even for MARKUP_WCHAR and convert)
  1432. // additional notes:
  1433. // _WIN32_WCE (Windows CE) has no strerror (Embedded VC++ uses FormatMessage)
  1434. // _MSC_VER >= 1310 (VC++ 2003/7.1) has _wcserror (but not used)
  1435. //
  1436. const int nErrorBufferSize = 100;
  1437. int nErr = 0;
  1438. MCD_CHAR szError[nErrorBufferSize+1];
  1439. #if defined(MCD_STRERROR) // C error routine
  1440. nErr = (int)errno;
  1441. #if defined(MARKUP_WCHAR)
  1442. char szMBError[nErrorBufferSize+1];
  1443. strncpy( szMBError, MCD_STRERROR, nErrorBufferSize );
  1444. szMBError[nErrorBufferSize] = '\0';
  1445. TextEncoding textencoding( MCD_T(""), (const void*)szMBError, strlen(szMBError) );
  1446. textencoding.m_nToCount = nErrorBufferSize;
  1447. int nWideLen = textencoding.PerformConversion( (void*)szError, MCD_ENC );
  1448. szError[nWideLen] = '\0';
  1449. #else
  1450. MCD_PSZNCPY( szError, MCD_STRERROR, nErrorBufferSize );
  1451. szError[nErrorBufferSize] = '\0';
  1452. #endif
  1453. #else // no C error routine, use Windows API
  1454. DWORD dwErr = ::GetLastError();
  1455. if ( ::FormatMessage(0x1200,0,dwErr,0,szError,nErrorBufferSize,0) < 1 )
  1456. szError[0] = '\0';
  1457. nErr = (int)dwErr;
  1458. #endif // no C error routine
  1459. MCD_STR strError = szError;
  1460. for ( int nChar=0; nChar<MCD_STRLENGTH(strError); ++nChar )
  1461. if ( strError[nChar] == '\r' || strError[nChar] == '\n' )
  1462. {
  1463. strError = MCD_STRMID( strError, 0, nChar ); // no trailing newline
  1464. break;
  1465. }
  1466. x_AddResult( m_strIOResult, MCD_T("file_error"), strError, MRC_MSG|MRC_NUMBER, nErr );
  1467. return false;
  1468. }
  1469. void FilePos::FileSpecifyEncoding( MCD_STR* pstrEncoding )
  1470. {
  1471. // In ReadTextFile, WriteTextFile and Open, the pstrEncoding argument can override or return the detected encoding
  1472. if ( pstrEncoding && m_strEncoding != *pstrEncoding )
  1473. {
  1474. if ( m_nFileCharUnitSize == 1 && *pstrEncoding != MCD_T("") )
  1475. m_strEncoding = *pstrEncoding; // override the encoding
  1476. else // just report the encoding
  1477. *pstrEncoding = m_strEncoding;
  1478. }
  1479. }
  1480. bool FilePos::FileAtTop()
  1481. {
  1482. // Return true if in the first block of file mode, max BOM < 5 bytes
  1483. if ( ((m_nDocFlags & CMarkup::MDF_READFILE) && m_nFileByteOffset < (MCD_INTFILEOFFSET)m_nOpFileByteLen + 5 )
  1484. || ((m_nDocFlags & CMarkup::MDF_WRITEFILE) && m_nFileByteOffset < 5) )
  1485. return true;
  1486. return false;
  1487. }
  1488. bool FilePos::FileOpen( MCD_CSTR_FILENAME szFileName )
  1489. {
  1490. MCD_STRCLEAR( m_strIOResult );
  1491. // Open file
  1492. MCD_PCSZ_FILENAME pMode = MCD_T_FILENAME("rb");
  1493. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1494. pMode = MCD_T_FILENAME("ab");
  1495. else if ( m_nDocFlags & CMarkup::MDF_WRITEFILE )
  1496. pMode = MCD_T_FILENAME("wb");
  1497. m_fp = NULL;
  1498. MCD_FOPEN( m_fp, szFileName, pMode );
  1499. if ( ! m_fp )
  1500. return FileErrorAddResult();
  1501. // Prepare file
  1502. bool bSuccess = true;
  1503. int nBomLen = 0;
  1504. m_nFileCharUnitSize = 1; // unless UTF-16 BOM
  1505. if ( m_nDocFlags & CMarkup::MDF_READFILE )
  1506. {
  1507. // Get file length
  1508. MCD_FSEEK( m_fp, 0, SEEK_END );
  1509. m_nFileByteLen = MCD_FTELL( m_fp );
  1510. MCD_FSEEK( m_fp, 0, SEEK_SET );
  1511. // Read the top of the file to check BOM and encoding
  1512. int nReadTop = 1024;
  1513. if ( m_nFileByteLen < nReadTop )
  1514. nReadTop = (int)m_nFileByteLen;
  1515. if ( nReadTop )
  1516. {
  1517. char* pFileTop = new char[nReadTop];
  1518. if ( nReadTop )
  1519. bSuccess = ( fread( pFileTop, nReadTop, 1, m_fp ) == 1 );
  1520. if ( bSuccess )
  1521. {
  1522. // Check for Byte Order Mark (preamble)
  1523. int nBomCheck = 0;
  1524. m_nDocFlags &= ~( CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF8PREAMBLE );
  1525. while ( BomTable[nBomCheck].pszBom )
  1526. {
  1527. while ( nBomLen < BomTable[nBomCheck].nBomLen )
  1528. {
  1529. if ( nBomLen >= nReadTop || pFileTop[nBomLen] != BomTable[nBomCheck].pszBom[nBomLen] )
  1530. break;
  1531. ++nBomLen;
  1532. }
  1533. if ( nBomLen == BomTable[nBomCheck].nBomLen )
  1534. {
  1535. m_nDocFlags |= BomTable[nBomCheck].nBomFlag;
  1536. if ( nBomLen == 2 )
  1537. m_nFileCharUnitSize = 2;
  1538. m_strEncoding = BomTable[nBomCheck].pszBomEnc;
  1539. break;
  1540. }
  1541. ++nBomCheck;
  1542. nBomLen = 0;
  1543. }
  1544. if ( nReadTop > nBomLen )
  1545. MCD_FSEEK( m_fp, nBomLen, SEEK_SET );
  1546. // Encoding check
  1547. if ( ! nBomLen )
  1548. {
  1549. MCD_STR strDeclCheck;
  1550. #if defined(MARKUP_WCHAR) // WCHAR
  1551. TextEncoding textencoding( MCD_T("UTF-8"), (const void*)pFileTop, nReadTop );
  1552. MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDeclCheck,nReadTop);
  1553. textencoding.m_nToCount = nReadTop;
  1554. int nDeclWideLen = textencoding.PerformConversion( (void*)pWideBuffer, MCD_ENC );
  1555. MCD_RELEASEBUFFER(strDeclCheck,pWideBuffer,nDeclWideLen);
  1556. #else // not WCHAR
  1557. MCD_STRASSIGN(strDeclCheck,pFileTop,nReadTop);
  1558. #endif // not WCHAR
  1559. m_strEncoding = CMarkup::GetDeclaredEncoding( strDeclCheck );
  1560. }
  1561. // Assume markup files starting with < sign are UTF-8 if otherwise unknown
  1562. if ( MCD_STRISEMPTY(m_strEncoding) && pFileTop[0] == '<' )
  1563. m_strEncoding = MCD_T("UTF-8");
  1564. }
  1565. delete [] pFileTop;
  1566. }
  1567. }
  1568. else if ( m_nDocFlags & CMarkup::MDF_WRITEFILE )
  1569. {
  1570. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1571. {
  1572. // fopen for append does not move the file pointer to the end until first I/O operation
  1573. MCD_FSEEK( m_fp, 0, SEEK_END );
  1574. m_nFileByteLen = MCD_FTELL( m_fp );
  1575. }
  1576. int nBomCheck = 0;
  1577. while ( BomTable[nBomCheck].pszBom )
  1578. {
  1579. if ( m_nDocFlags & BomTable[nBomCheck].nBomFlag )
  1580. {
  1581. nBomLen = BomTable[nBomCheck].nBomLen;
  1582. if ( nBomLen == 2 )
  1583. m_nFileCharUnitSize = 2;
  1584. m_strEncoding = BomTable[nBomCheck].pszBomEnc;
  1585. if ( m_nFileByteLen ) // append
  1586. nBomLen = 0;
  1587. else // write BOM
  1588. bSuccess = ( fwrite(BomTable[nBomCheck].pszBom,nBomLen,1,m_fp) == 1 );
  1589. break;
  1590. }
  1591. ++nBomCheck;
  1592. }
  1593. }
  1594. if ( ! bSuccess )
  1595. return FileErrorAddResult();
  1596. if ( m_nDocFlags & CMarkup::MDF_APPENDFILE )
  1597. m_nFileByteOffset = m_nFileByteLen;
  1598. else
  1599. m_nFileByteOffset = (MCD_INTFILEOFFSET)nBomLen;
  1600. if ( nBomLen )
  1601. x_AddResult( m_strIOResult, MCD_T("bom") );
  1602. return bSuccess;
  1603. }
  1604. bool FilePos::FileRead( void* pBuffer )
  1605. {
  1606. bool bSuccess = ( fread( pBuffer,m_nOpFileByteLen,1,m_fp) == 1 );
  1607. m_nOpFileTextLen = m_nOpFileByteLen / m_nFileCharUnitSize;
  1608. if ( bSuccess )
  1609. {
  1610. m_nFileByteOffset += m_nOpFileByteLen;
  1611. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, m_nOpFileTextLen );
  1612. // Microsoft components can produce apparently valid docs with some nulls at ends of values
  1613. int nNullCount = 0;
  1614. int nNullCheckCharsRemaining = m_nOpFileTextLen;
  1615. char* pAfterNull = NULL;
  1616. char* pNullScan = (char*)pBuffer;
  1617. bool bSingleByteChar = m_nFileCharUnitSize == 1;
  1618. while ( nNullCheckCharsRemaining-- )
  1619. {
  1620. if ( bSingleByteChar? (! *pNullScan) : (! (*(unsigned short*)pNullScan)) )
  1621. {
  1622. if ( pAfterNull && pNullScan != pAfterNull )
  1623. memmove( pAfterNull - (nNullCount*m_nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull );
  1624. pAfterNull = pNullScan + m_nFileCharUnitSize;
  1625. ++nNullCount;
  1626. }
  1627. pNullScan += m_nFileCharUnitSize;
  1628. }
  1629. if ( pAfterNull && pNullScan != pAfterNull )
  1630. memmove( pAfterNull - (nNullCount*m_nFileCharUnitSize), pAfterNull, pNullScan - pAfterNull );
  1631. if ( nNullCount )
  1632. {
  1633. x_AddResult( m_strIOResult, MCD_T("nulls_removed"), NULL, MRC_COUNT, nNullCount );
  1634. m_nOpFileTextLen -= nNullCount;
  1635. }
  1636. // Big endian/little endian conversion
  1637. if ( m_nFileCharUnitSize > 1 && x_EndianSwapRequired(m_nDocFlags) )
  1638. {
  1639. x_EndianSwapUTF16( (unsigned short*)pBuffer, m_nOpFileTextLen );
  1640. x_AddResult( m_strIOResult, MCD_T("endian_swap") );
  1641. }
  1642. }
  1643. if ( ! bSuccess )
  1644. FileErrorAddResult();
  1645. return bSuccess;
  1646. }
  1647. bool FilePos::FileCheckRaggedEnd( void* pBuffer )
  1648. {
  1649. // In file read mode, piece of file text in memory must end on a character boundary
  1650. // This check must happen after the encoding has been decided, so after UTF-8 autodetection
  1651. // If ragged, adjust file position, m_nOpFileTextLen and m_nOpFileByteLen
  1652. int nTruncBeforeBytes = 0;
  1653. TextEncoding textencoding( m_strEncoding, pBuffer, m_nOpFileTextLen );
  1654. if ( ! textencoding.FindRaggedEnd(nTruncBeforeBytes) )
  1655. {
  1656. // Input must be garbled? decoding error before potentially ragged end, add error result and continue
  1657. MCD_STR strEncoding = m_strEncoding;
  1658. if ( MCD_STRISEMPTY(strEncoding) )
  1659. strEncoding = MCD_T("ANSI");
  1660. x_AddResult( m_strIOResult, MCD_T("truncation_error"), strEncoding, MRC_ENCODING );
  1661. }
  1662. else if ( nTruncBeforeBytes )
  1663. {
  1664. nTruncBeforeBytes *= -1;
  1665. m_nFileByteOffset += nTruncBeforeBytes;
  1666. MCD_FSEEK( m_fp, m_nFileByteOffset, SEEK_SET );
  1667. m_nOpFileByteLen += nTruncBeforeBytes;
  1668. m_nOpFileTextLen += nTruncBeforeBytes / m_nFileCharUnitSize;
  1669. x_AddResult( m_strIOResult, MCD_T("read"), NULL, MRC_MODIFY|MRC_LENGTH, m_nOpFileTextLen );
  1670. }
  1671. return true;
  1672. }
  1673. bool FilePos::FileReadText( MCD_STR& strDoc )
  1674. {
  1675. bool bSuccess = true;
  1676. MCD_STRCLEAR( m_strIOResult );
  1677. if ( ! m_nOpFileByteLen )
  1678. {
  1679. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, 0 );
  1680. return bSuccess;
  1681. }
  1682. // Only read up to end of file (a single read byte length cannot be over the capacity of int)
  1683. bool bCheckRaggedEnd = true;
  1684. MCD_INTFILEOFFSET nBytesRemaining = m_nFileByteLen - m_nFileByteOffset;
  1685. if ( (MCD_INTFILEOFFSET)m_nOpFileByteLen >= nBytesRemaining )
  1686. {
  1687. m_nOpFileByteLen = (int)nBytesRemaining;
  1688. bCheckRaggedEnd = false;
  1689. }
  1690. if ( m_nDocFlags & (CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF16BEFILE) )
  1691. {
  1692. int nUTF16Len = m_nOpFileByteLen / 2;
  1693. #if defined(MARKUP_WCHAR) // WCHAR
  1694. int nBufferSizeForGrow = nUTF16Len + nUTF16Len/100; // extra 1%
  1695. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  1696. unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1];
  1697. bSuccess = FileRead( pUTF16Buffer );
  1698. if ( bSuccess )
  1699. {
  1700. if ( bCheckRaggedEnd )
  1701. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1702. TextEncoding textencoding( MCD_T("UTF-16"), (const void*)pUTF16Buffer, m_nOpFileTextLen );
  1703. textencoding.m_nToCount = nBufferSizeForGrow;
  1704. MCD_CHAR* pUTF32Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1705. int nUTF32Len = textencoding.PerformConversion( (void*)pUTF32Buffer, MCD_T("UTF-32") );
  1706. MCD_RELEASEBUFFER(strDoc,pUTF32Buffer,nUTF32Len);
  1707. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_T("UTF-32"), MRC_ENCODING|MRC_LENGTH, nUTF32Len );
  1708. }
  1709. #else // sizeof(wchar_t) == 2
  1710. MCD_CHAR* pUTF16Buffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1711. bSuccess = FileRead( pUTF16Buffer );
  1712. if ( bSuccess && bCheckRaggedEnd )
  1713. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1714. MCD_RELEASEBUFFER(strDoc,pUTF16Buffer,m_nOpFileTextLen);
  1715. #endif // sizeof(wchar_t) == 2
  1716. #else // not WCHAR
  1717. // Convert file from UTF-16; it needs to be in memory as UTF-8 or MBCS
  1718. unsigned short* pUTF16Buffer = new unsigned short[nUTF16Len+1];
  1719. bSuccess = FileRead( pUTF16Buffer );
  1720. if ( bSuccess && bCheckRaggedEnd )
  1721. FileCheckRaggedEnd( (void*)pUTF16Buffer );
  1722. TextEncoding textencoding( MCD_T("UTF-16"), (const void*)pUTF16Buffer, m_nOpFileTextLen );
  1723. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1724. int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1%
  1725. MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1726. textencoding.PerformConversion( (void*)pMBBuffer );
  1727. delete [] pUTF16Buffer;
  1728. MCD_RELEASEBUFFER(strDoc,pMBBuffer,nMBLen);
  1729. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1730. if ( textencoding.m_nFailedChars )
  1731. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1732. #endif // not WCHAR
  1733. }
  1734. else // single or multibyte file (i.e. not UTF-16)
  1735. {
  1736. #if defined(MARKUP_WCHAR) // WCHAR
  1737. char* pBuffer = new char[m_nOpFileByteLen];
  1738. bSuccess = FileRead( pBuffer );
  1739. if ( MCD_STRISEMPTY(m_strEncoding) )
  1740. {
  1741. int nNonASCII;
  1742. bool bErrorAtEnd;
  1743. if ( CMarkup::DetectUTF8(pBuffer,m_nOpFileByteLen,&nNonASCII,&bErrorAtEnd) || (bCheckRaggedEnd && bErrorAtEnd) )
  1744. {
  1745. m_strEncoding = MCD_T("UTF-8");
  1746. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_MODIFY|MRC_ENCODING );
  1747. }
  1748. x_AddResult( m_strIOResult, MCD_T("utf8_detection") );
  1749. }
  1750. if ( bSuccess && bCheckRaggedEnd )
  1751. FileCheckRaggedEnd( (void*)pBuffer );
  1752. TextEncoding textencoding( m_strEncoding, (const void*)pBuffer, m_nOpFileTextLen );
  1753. int nWideLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1754. int nBufferSizeForGrow = nWideLen + nWideLen/100; // extra 1%
  1755. MCD_CHAR* pWideBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1756. textencoding.PerformConversion( (void*)pWideBuffer );
  1757. MCD_RELEASEBUFFER( strDoc, pWideBuffer, nWideLen );
  1758. delete [] pBuffer;
  1759. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWideLen );
  1760. #else // not WCHAR
  1761. // After loading a file with unknown multi-byte encoding
  1762. bool bAssumeUnknownIsNative = false;
  1763. if ( MCD_STRISEMPTY(m_strEncoding) )
  1764. {
  1765. bAssumeUnknownIsNative = true;
  1766. m_strEncoding = MCD_ENC;
  1767. }
  1768. if ( TextEncoding::CanConvert(MCD_ENC,m_strEncoding) )
  1769. {
  1770. char* pBuffer = new char[m_nOpFileByteLen];
  1771. bSuccess = FileRead( pBuffer );
  1772. if ( bSuccess && bCheckRaggedEnd )
  1773. FileCheckRaggedEnd( (void*)pBuffer );
  1774. TextEncoding textencoding( m_strEncoding, (const void*)pBuffer, m_nOpFileTextLen );
  1775. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1776. int nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1%
  1777. MCD_CHAR* pMBBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1778. textencoding.PerformConversion( (void*)pMBBuffer );
  1779. MCD_RELEASEBUFFER( strDoc, pMBBuffer, nMBLen );
  1780. delete [] pBuffer;
  1781. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1782. if ( textencoding.m_nFailedChars )
  1783. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1784. }
  1785. else // load directly into string
  1786. {
  1787. int nBufferSizeForGrow = m_nOpFileByteLen + m_nOpFileByteLen/100; // extra 1%
  1788. MCD_CHAR* pBuffer = MCD_GETBUFFER(strDoc,nBufferSizeForGrow);
  1789. bSuccess = FileRead( pBuffer );
  1790. bool bConvertMB = false;
  1791. if ( bAssumeUnknownIsNative )
  1792. {
  1793. // Might need additional conversion if we assumed an encoding
  1794. int nNonASCII;
  1795. bool bErrorAtEnd;
  1796. bool bIsUTF8 = CMarkup::DetectUTF8( pBuffer, m_nOpFileByteLen, &nNonASCII, &bErrorAtEnd ) || (bCheckRaggedEnd && bErrorAtEnd);
  1797. MCD_STR strDetectedEncoding = bIsUTF8? MCD_T("UTF-8"): MCD_T("");
  1798. if ( nNonASCII && m_strEncoding != strDetectedEncoding ) // only need to convert non-ASCII
  1799. bConvertMB = true;
  1800. m_strEncoding = strDetectedEncoding;
  1801. if ( bIsUTF8 )
  1802. x_AddResult( m_strIOResult, MCD_T("read"), m_strEncoding, MRC_MODIFY|MRC_ENCODING );
  1803. }
  1804. if ( bSuccess && bCheckRaggedEnd )
  1805. FileCheckRaggedEnd( (void*)pBuffer );
  1806. MCD_RELEASEBUFFER( strDoc, pBuffer, m_nOpFileTextLen );
  1807. if ( bConvertMB )
  1808. {
  1809. TextEncoding textencoding( m_strEncoding, MCD_2PCSZ(strDoc), m_nOpFileTextLen );
  1810. int nMBLen = textencoding.PerformConversion( NULL, MCD_ENC );
  1811. nBufferSizeForGrow = nMBLen + nMBLen/100; // extra 1%
  1812. MCD_STR strConvDoc;
  1813. pBuffer = MCD_GETBUFFER(strConvDoc,nBufferSizeForGrow);
  1814. textencoding.PerformConversion( (void*)pBuffer );
  1815. MCD_RELEASEBUFFER( strConvDoc, pBuffer, nMBLen );
  1816. strDoc = strConvDoc;
  1817. x_AddResult( m_strIOResult, MCD_T("converted_to"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nMBLen );
  1818. if ( textencoding.m_nFailedChars )
  1819. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1820. }
  1821. if ( bAssumeUnknownIsNative )
  1822. x_AddResult( m_strIOResult, MCD_T("utf8_detection") );
  1823. }
  1824. #endif // not WCHAR
  1825. }
  1826. return bSuccess;
  1827. }
  1828. bool FilePos::FileWrite( void* pBuffer, const void* pConstBuffer /*=NULL*/ )
  1829. {
  1830. m_nOpFileByteLen = m_nOpFileTextLen * m_nFileCharUnitSize;
  1831. if ( ! pConstBuffer )
  1832. pConstBuffer = pBuffer;
  1833. unsigned short* pTempEndianBuffer = NULL;
  1834. if ( x_EndianSwapRequired(m_nDocFlags) )
  1835. {
  1836. if ( ! pBuffer )
  1837. {
  1838. pTempEndianBuffer = new unsigned short[m_nOpFileTextLen];
  1839. memcpy( pTempEndianBuffer, pConstBuffer, m_nOpFileTextLen * 2 );
  1840. pBuffer = pTempEndianBuffer;
  1841. pConstBuffer = pTempEndianBuffer;
  1842. }
  1843. x_EndianSwapUTF16( (unsigned short*)pBuffer, m_nOpFileTextLen );
  1844. x_AddResult( m_strIOResult, MCD_T("endian_swap") );
  1845. }
  1846. bool bSuccess = ( fwrite( pConstBuffer, m_nOpFileByteLen, 1, m_fp ) == 1 );
  1847. if ( pTempEndianBuffer )
  1848. delete [] pTempEndianBuffer;
  1849. if ( bSuccess )
  1850. {
  1851. m_nFileByteOffset += m_nOpFileByteLen;
  1852. x_AddResult( m_strIOResult, MCD_T("write"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, m_nOpFileTextLen );
  1853. }
  1854. else
  1855. FileErrorAddResult();
  1856. return bSuccess;
  1857. }
  1858. bool FilePos::FileWriteText( const MCD_STR& strDoc, int nWriteStrLen/*=-1*/ )
  1859. {
  1860. bool bSuccess = true;
  1861. MCD_STRCLEAR( m_strIOResult );
  1862. MCD_PCSZ pDoc = MCD_2PCSZ(strDoc);
  1863. if ( nWriteStrLen == -1 )
  1864. nWriteStrLen = MCD_STRLENGTH(strDoc);
  1865. if ( ! nWriteStrLen )
  1866. {
  1867. x_AddResult( m_strIOResult, MCD_T("write"), m_strEncoding, MRC_ENCODING|MRC_LENGTH, 0 );
  1868. return bSuccess;
  1869. }
  1870. if ( m_nDocFlags & (CMarkup::MDF_UTF16LEFILE | CMarkup::MDF_UTF16BEFILE) )
  1871. {
  1872. #if defined(MARKUP_WCHAR) // WCHAR
  1873. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  1874. TextEncoding textencoding( MCD_T("UTF-32"), (const void*)pDoc, nWriteStrLen );
  1875. m_nOpFileTextLen = textencoding.PerformConversion( NULL, MCD_T("UTF-16") );
  1876. unsigned short* pUTF16Buffer = new unsigned short[m_nOpFileTextLen];
  1877. textencoding.PerformConversion( (void*)pUTF16Buffer );
  1878. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_T("UTF-32"), MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1879. bSuccess = FileWrite( pUTF16Buffer );
  1880. delete [] pUTF16Buffer;
  1881. #else // sizeof(wchar_t) == 2
  1882. m_nOpFileTextLen = nWriteStrLen;
  1883. bSuccess = FileWrite( NULL, pDoc );
  1884. #endif
  1885. #else // not WCHAR
  1886. TextEncoding textencoding( MCD_ENC, (const void*)pDoc, nWriteStrLen );
  1887. m_nOpFileTextLen = textencoding.PerformConversion( NULL, MCD_T("UTF-16") );
  1888. unsigned short* pUTF16Buffer = new unsigned short[m_nOpFileTextLen];
  1889. textencoding.PerformConversion( (void*)pUTF16Buffer );
  1890. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1891. bSuccess = FileWrite( pUTF16Buffer );
  1892. delete [] pUTF16Buffer;
  1893. #endif // not WCHAR
  1894. }
  1895. else // single or multibyte file (i.e. not UTF-16)
  1896. {
  1897. #if ! defined(MARKUP_WCHAR) // not WCHAR
  1898. if ( ! TextEncoding::CanConvert(m_strEncoding,MCD_ENC) )
  1899. {
  1900. // Same or unsupported multi-byte to multi-byte, so save directly from string
  1901. m_nOpFileTextLen = nWriteStrLen;
  1902. bSuccess = FileWrite( NULL, pDoc );
  1903. return bSuccess;
  1904. }
  1905. #endif // not WCHAR
  1906. TextEncoding textencoding( MCD_ENC, (const void*)pDoc, nWriteStrLen );
  1907. m_nOpFileTextLen = textencoding.PerformConversion( NULL, m_strEncoding );
  1908. char* pMBBuffer = new char[m_nOpFileTextLen];
  1909. textencoding.PerformConversion( (void*)pMBBuffer );
  1910. x_AddResult( m_strIOResult, MCD_T("converted_from"), MCD_ENC, MRC_ENCODING|MRC_LENGTH, nWriteStrLen );
  1911. if ( textencoding.m_nFailedChars )
  1912. x_AddResult( m_strIOResult, MCD_T("conversion_loss") );
  1913. bSuccess = FileWrite( pMBBuffer );
  1914. delete [] pMBBuffer;
  1915. }
  1916. return bSuccess;
  1917. }
  1918. bool FilePos::FileClose()
  1919. {
  1920. if ( m_fp )
  1921. {
  1922. if ( fclose(m_fp) )
  1923. FileErrorAddResult();
  1924. m_fp = NULL;
  1925. m_nDocFlags &= ~(CMarkup::MDF_WRITEFILE|CMarkup::MDF_READFILE|CMarkup::MDF_APPENDFILE);
  1926. return true;
  1927. }
  1928. return false;
  1929. }
  1930. bool FilePos::FileReadNextBuffer()
  1931. {
  1932. // If not end of file, returns amount to subtract from offsets
  1933. if ( m_nFileByteOffset < m_nFileByteLen )
  1934. {
  1935. // Prepare to put this node at beginning
  1936. MCD_STR& str = *m_pstrBuffer;
  1937. int nDocLength = MCD_STRLENGTH( str );
  1938. int nRemove = m_nReadBufferStart;
  1939. m_nReadBufferRemoved = nRemove;
  1940. // Gather
  1941. if ( m_nReadGatherStart != -1 )
  1942. {
  1943. if ( m_nReadBufferStart > m_nReadGatherStart )
  1944. {
  1945. // In case it is a large subdoc, reduce reallocs by using x_StrInsertReplace
  1946. MCD_STR strAppend = MCD_STRMID( str, m_nReadGatherStart, m_nReadBufferStart - m_nReadGatherStart );
  1947. x_StrInsertReplace( m_strReadGatherMarkup, MCD_STRLENGTH(m_strReadGatherMarkup), 0, strAppend );
  1948. }
  1949. m_nReadGatherStart = 0;
  1950. }
  1951. // Increase capacity if keeping more than half of nDocLength
  1952. int nKeepLength = nDocLength - nRemove;
  1953. if ( nKeepLength > nDocLength / 2 )
  1954. m_nBlockSizeBasis *= 2;
  1955. if ( nRemove )
  1956. x_StrInsertReplace( str, 0, nRemove, MCD_STR() );
  1957. MCD_STR strRead;
  1958. m_nOpFileByteLen = m_nBlockSizeBasis - nKeepLength;
  1959. m_nOpFileByteLen += 4 - m_nOpFileByteLen % 4; // round up to 4-byte offset
  1960. FileReadText( strRead );
  1961. x_StrInsertReplace( str, nKeepLength, 0, strRead );
  1962. m_nReadBufferStart = 0; // next time just elongate/increase capacity
  1963. return true;
  1964. }
  1965. return false;
  1966. }
  1967. void FilePos::FileGatherStart( int nStart )
  1968. {
  1969. m_nReadGatherStart = nStart;
  1970. }
  1971. int FilePos::FileGatherEnd( MCD_STR& strMarkup )
  1972. {
  1973. int nStart = m_nReadGatherStart;
  1974. m_nReadGatherStart = -1;
  1975. strMarkup = m_strReadGatherMarkup;
  1976. MCD_STRCLEAR( m_strReadGatherMarkup );
  1977. return nStart;
  1978. }
  1979. bool FilePos::FileFlush( MCD_STR& strBuffer, int nWriteStrLen/*=-1*/, bool bFflush/*=false*/ )
  1980. {
  1981. bool bSuccess = true;
  1982. MCD_STRCLEAR( m_strIOResult );
  1983. if ( nWriteStrLen == -1 )
  1984. nWriteStrLen = MCD_STRLENGTH( strBuffer );
  1985. if ( nWriteStrLen )
  1986. {
  1987. if ( (! m_nFileByteOffset) && MCD_STRISEMPTY(m_strEncoding) && ! MCD_STRISEMPTY(strBuffer) )
  1988. {
  1989. m_strEncoding = CMarkup::GetDeclaredEncoding( strBuffer );
  1990. if ( MCD_STRISEMPTY(m_strEncoding) )
  1991. m_strEncoding = MCD_T("UTF-8");
  1992. }
  1993. bSuccess = FileWriteText( strBuffer, nWriteStrLen );
  1994. if ( bSuccess )
  1995. x_StrInsertReplace( strBuffer, 0, nWriteStrLen, MCD_STR() );
  1996. }
  1997. if ( bFflush && bSuccess )
  1998. {
  1999. if ( fflush(m_fp) )
  2000. bSuccess = FileErrorAddResult();
  2001. }
  2002. return bSuccess;
  2003. }
  2004. //////////////////////////////////////////////////////////////////////
  2005. // PathPos encapsulates parsing of the path string used in Find methods
  2006. //
  2007. struct PathPos
  2008. {
  2009. PathPos( MCD_PCSZ pszPath, bool b ) { p=pszPath; bReader=b; i=0; iPathAttribName=0; iSave=0; nPathType=0; if (!ParsePath()) nPathType=-1; };
  2010. int GetTypeAndInc() { i=-1; if (p) { if (p[0]=='/') { if (p[1]=='/') i=2; else i=1; } else if (p[0]) i=0; } nPathType=i+1; return nPathType; };
  2011. int GetNumAndInc() { int n=0; while (p[i]>='0'&&p[i]<='9') n=n*10+(int)p[i++]-(int)'0'; return n; };
  2012. MCD_PCSZ GetValAndInc() { ++i; MCD_CHAR cEnd=']'; if (p[i]=='\''||p[i]=='\"') cEnd=p[i++]; int iVal=i; IncWord(cEnd); nLen=i-iVal; if (cEnd!=']') ++i; return &p[iVal]; };
  2013. int GetValOrWordLen() { return nLen; };
  2014. MCD_CHAR GetChar() { return p[i]; };
  2015. bool IsAtPathEnd() { return ((!p[i])||(iPathAttribName&&i+2>=iPathAttribName))?true:false; };
  2016. MCD_PCSZ GetPtr() { return &p[i]; };
  2017. void SaveOffset() { iSave=i; };
  2018. void RevertOffset() { i=iSave; };
  2019. void RevertOffsetAsName() { i=iSave; nPathType=1; };
  2020. MCD_PCSZ GetWordAndInc() { int iWord=i; IncWord(); nLen=i-iWord; return &p[iWord]; };
  2021. void IncWord() { while (p[i]&&!x_ISENDPATHWORD(p[i])) i+=MCD_CLEN(&p[i]); };
  2022. void IncWord( MCD_CHAR c ) { while (p[i]&&p[i]!=c) i+=MCD_CLEN(&p[i]); };
  2023. void IncChar() { ++i; };
  2024. void Inc( int n ) { i+=n; };
  2025. bool IsAnywherePath() { return nPathType == 3; };
  2026. bool IsAbsolutePath() { return nPathType == 2; };
  2027. bool IsPath() { return nPathType > 0; };
  2028. bool ValidPath() { return nPathType != -1; };
  2029. MCD_PCSZ GetPathAttribName() { if (iPathAttribName) return &p[iPathAttribName]; return NULL; };
  2030. bool AttribPredicateMatch( TokenPos& token );
  2031. private:
  2032. bool ParsePath();
  2033. int nPathType; // -1 invalid, 0 empty, 1 name, 2 absolute path, 3 anywhere path
  2034. bool bReader;
  2035. MCD_PCSZ p;
  2036. int i;
  2037. int iPathAttribName;
  2038. int iSave;
  2039. int nLen;
  2040. };
  2041. bool PathPos::ParsePath()
  2042. {
  2043. // Determine if the path seems to be in a valid format before attempting to find
  2044. if ( GetTypeAndInc() )
  2045. {
  2046. SaveOffset();
  2047. while ( 1 )
  2048. {
  2049. if ( ! GetChar() )
  2050. return false;
  2051. IncWord(); // Tag name
  2052. if ( GetChar() == '[' ) // predicate
  2053. {
  2054. IncChar(); // [
  2055. if ( GetChar() >= '1' && GetChar() <= '9' )
  2056. GetNumAndInc();
  2057. else // attrib or child tag name
  2058. {
  2059. if ( GetChar() == '@' )
  2060. {
  2061. IncChar(); // @
  2062. IncWord(); // attrib name
  2063. if ( GetChar() == '=' )
  2064. GetValAndInc();
  2065. }
  2066. else
  2067. {
  2068. if ( bReader )
  2069. return false;
  2070. IncWord();
  2071. }
  2072. }
  2073. if ( GetChar() != ']' )
  2074. return false;
  2075. IncChar(); // ]
  2076. }
  2077. // Another level of path
  2078. if ( GetChar() == '/' )
  2079. {
  2080. if ( IsAnywherePath() )
  2081. return false; // multiple levels not supported for // path
  2082. IncChar();
  2083. if ( GetChar() == '@' )
  2084. {
  2085. // FindGetData and FindSetData support paths ending in attribute
  2086. IncChar(); // @
  2087. iPathAttribName = i;
  2088. IncWord(); // attrib name
  2089. if ( GetChar() )
  2090. return false; // it should have ended with attribute name
  2091. break;
  2092. }
  2093. }
  2094. else
  2095. {
  2096. if ( GetChar() )
  2097. return false; // not a slash, so it should have ended here
  2098. break;
  2099. }
  2100. }
  2101. RevertOffset();
  2102. }
  2103. return true;
  2104. }
  2105. bool PathPos::AttribPredicateMatch( TokenPos& token )
  2106. {
  2107. // Support attribute predicate matching in regular and file read mode
  2108. // token.m_nNext must already be set to node.nStart + 1 or ELEM(i).nStart + 1
  2109. IncChar(); // @
  2110. if ( token.FindAttrib(GetPtr()) )
  2111. {
  2112. IncWord();
  2113. if ( GetChar() == '=' )
  2114. {
  2115. MCD_PCSZ pszVal = GetValAndInc();
  2116. MCD_STR strPathValue = CMarkup::UnescapeText( pszVal, GetValOrWordLen() );
  2117. MCD_STR strAttribValue = CMarkup::UnescapeText( token.GetTokenPtr(), token.Length(), token.m_nTokenFlags );
  2118. if ( strPathValue != strAttribValue )
  2119. return false;
  2120. }
  2121. return true;
  2122. }
  2123. return false;
  2124. }
  2125. //////////////////////////////////////////////////////////////////////
  2126. // A map is a table of SavedPos structs
  2127. //
  2128. struct SavedPos
  2129. {
  2130. // SavedPos is an entry in the SavedPosMap hash table
  2131. SavedPos() { nSavedPosFlags=0; iPos=0; };
  2132. MCD_STR strName;
  2133. int iPos;
  2134. enum { SPM_MAIN = 1, SPM_CHILD = 2, SPM_USED = 4, SPM_LAST = 8 };
  2135. int nSavedPosFlags;
  2136. };
  2137. struct SavedPosMap
  2138. {
  2139. // SavedPosMap is only created if SavePos/RestorePos are used
  2140. SavedPosMap( int nSize ) { nMapSize=nSize; pTable = new SavedPos*[nSize]; memset(pTable,0,nSize*sizeof(SavedPos*)); };
  2141. ~SavedPosMap() { if (pTable) { for (int n=0;n<nMapSize;++n) if (pTable[n]) delete[] pTable[n]; delete[] pTable; } };
  2142. SavedPos** pTable;
  2143. int nMapSize;
  2144. };
  2145. struct SavedPosMapArray
  2146. {
  2147. // SavedPosMapArray keeps pointers to SavedPosMap instances
  2148. SavedPosMapArray() { m_pMaps = NULL; };
  2149. ~SavedPosMapArray() { ReleaseMaps(); };
  2150. void ReleaseMaps() { SavedPosMap**p = m_pMaps; if (p) { while (*p) delete *p++; delete[] m_pMaps; m_pMaps=NULL; } };
  2151. bool GetMap( SavedPosMap*& pMap, int nMap, int nMapSize = 7 );
  2152. void CopySavedPosMaps( SavedPosMapArray* pOtherMaps );
  2153. SavedPosMap** m_pMaps; // NULL terminated array
  2154. };
  2155. bool SavedPosMapArray::GetMap( SavedPosMap*& pMap, int nMap, int nMapSize /*=7*/ )
  2156. {
  2157. // Find or create map, returns true if map(s) created
  2158. SavedPosMap** pMapsExisting = m_pMaps;
  2159. int nMapIndex = 0;
  2160. if ( pMapsExisting )
  2161. {
  2162. // Length of array is unknown, so loop through maps
  2163. while ( nMapIndex <= nMap )
  2164. {
  2165. pMap = pMapsExisting[nMapIndex];
  2166. if ( ! pMap )
  2167. break;
  2168. if ( nMapIndex == nMap )
  2169. return false; // not created
  2170. ++nMapIndex;
  2171. }
  2172. nMapIndex = 0;
  2173. }
  2174. // Create map(s)
  2175. // If you access map 1 before map 0 created, then 2 maps will be created
  2176. m_pMaps = new SavedPosMap*[nMap+2];
  2177. if ( pMapsExisting )
  2178. {
  2179. while ( pMapsExisting[nMapIndex] )
  2180. {
  2181. m_pMaps[nMapIndex] = pMapsExisting[nMapIndex];
  2182. ++nMapIndex;
  2183. }
  2184. delete[] pMapsExisting;
  2185. }
  2186. while ( nMapIndex <= nMap )
  2187. {
  2188. m_pMaps[nMapIndex] = new SavedPosMap( nMapSize );
  2189. ++nMapIndex;
  2190. }
  2191. m_pMaps[nMapIndex] = NULL;
  2192. pMap = m_pMaps[nMap];
  2193. return true; // map(s) created
  2194. }
  2195. void SavedPosMapArray::CopySavedPosMaps( SavedPosMapArray* pOtherMaps )
  2196. {
  2197. ReleaseMaps();
  2198. if ( pOtherMaps->m_pMaps )
  2199. {
  2200. int nMap = 0;
  2201. SavedPosMap* pMap = NULL;
  2202. while ( pOtherMaps->m_pMaps[nMap] )
  2203. {
  2204. SavedPosMap* pMapSrc = pOtherMaps->m_pMaps[nMap];
  2205. GetMap( pMap, nMap, pMapSrc->nMapSize );
  2206. for ( int nSlot=0; nSlot < pMap->nMapSize; ++nSlot )
  2207. {
  2208. SavedPos* pCopySavedPos = pMapSrc->pTable[nSlot];
  2209. if ( pCopySavedPos )
  2210. {
  2211. int nCount = 0;
  2212. while ( pCopySavedPos[nCount].nSavedPosFlags & SavedPos::SPM_USED )
  2213. {
  2214. ++nCount;
  2215. if ( pCopySavedPos[nCount-1].nSavedPosFlags & SavedPos::SPM_LAST )
  2216. break;
  2217. }
  2218. if ( nCount )
  2219. {
  2220. SavedPos* pNewSavedPos = new SavedPos[nCount];
  2221. for ( int nCopy=0; nCopy<nCount; ++nCopy )
  2222. pNewSavedPos[nCopy] = pCopySavedPos[nCopy];
  2223. pNewSavedPos[nCount-1].nSavedPosFlags |= SavedPos::SPM_LAST;
  2224. pMap->pTable[nSlot] = pNewSavedPos;
  2225. }
  2226. }
  2227. }
  2228. ++nMap;
  2229. }
  2230. }
  2231. }
  2232. //////////////////////////////////////////////////////////////////////
  2233. // Core parser function
  2234. //
  // Parse one node (tag, text run, comment, etc.) starting at m_nNext.
  // Works one character at a time: nParseFlags holds the pending state while
  // the node type is still being decided, and pFindEnd/nEndLen hold the
  // terminator to look for once the node type is known.
  2235. int TokenPos::ParseNode( NodePos& node )
  2236. {
  2237. // Call this with m_nNext set to the start of the node or tag
  2238. // Upon return m_nNext points to the char after the node or tag
  2239. // m_nL and m_nR are set to name location if it is a tag with a name
  2240. // node members set to node location, strMeta used for parse error
  2241. //
  2242. // <!--...--> comment
  2243. // <!DOCTYPE ...> dtd
  2244. // <?target ...?> processing instruction
  2245. // <![CDATA[...]]> cdata section
  2246. // <NAME ...> element start tag
  2247. // </NAME ...> element end tag
  2248. //
  2249. // returns the nodetype or
  2250. // 0 for end tag
  2251. // -1 for bad node
  2252. // -2 for end of document
  2253. //
  // State bits kept in nParseFlags
  2254. enum ParseBits
  2255. {
  2256. PD_OPENTAG = 1,
  2257. PD_BANG = 2,
  2258. PD_DASH = 4,
  2259. PD_BRACKET = 8,
  2260. PD_TEXTORWS = 16,
  2261. PD_DOCTYPE = 32,
  2262. PD_INQUOTE_S = 64,
  2263. PD_INQUOTE_D = 128,
  2264. PD_EQUALS = 256,
  2265. PD_NOQUOTEVAL = 512
  2266. };
  2267. int nParseFlags = 0;
  2268. MCD_PCSZ pFindEnd = NULL;
  2269. int nNodeType = -1;
  2270. int nEndLen = 0;
  // nName is the offset of a tag name still being scanned (0 when none)
  2271. int nName = 0;
  2272. int nNameLen = 0;
  // The two previous characters seen inside a start tag
  2273. unsigned int cDminus1 = 0, cDminus2 = 0;
  // FINDNODETYPE records the terminator to look for and the node type;
  // FINDNODETYPENAME also notes where the tag name starts (n chars after pD);
  // FINDNODEBAD adds error e to node.strMeta and scans to the next '>'
  2274. #define FINDNODETYPE(e,t) { pFindEnd=e; nEndLen=(sizeof(e)-1)/sizeof(MCD_CHAR); nNodeType=t; }
  2275. #define FINDNODETYPENAME(e,t,n) { FINDNODETYPE(e,t) nName=(int)(pD-m_pDocText)+n; }
  2276. #define FINDNODEBAD(e) { pFindEnd=MCD_T(">"); nEndLen=1; x_AddResult(node.strMeta,e,NULL,0,m_nNext); nNodeType=-1; }
  2277. node.nStart = m_nNext;
  2278. node.nNodeFlags = 0;
  2279. MCD_PCSZ pD = &m_pDocText[m_nNext];
  2280. unsigned int cD;
  2281. while ( 1 )
  2282. {
  2283. cD = (unsigned int)*pD;
  // Null terminator: end of the text currently in memory
  2284. if ( ! cD )
  2285. {
  2286. m_nNext = (int)(pD - m_pDocText);
  2287. if ( m_pReaderFilePos ) // read file mode
  2288. {
  2289. // Read buffer may only be removed on the first FileReadNextBuffer in this node
  2290. int nRemovedAlready = m_pReaderFilePos->m_nReadBufferRemoved;
  2291. if ( m_pReaderFilePos->FileReadNextBuffer() ) // more text in file?
  2292. {
  // Text may have been dropped from the front of the buffer, so shift
  // every offset into it back by nRemove and re-fetch the text pointer
  2293. int nNodeLength = m_nNext - node.nStart;
  2294. int nRemove = m_pReaderFilePos->m_nReadBufferRemoved;
  2295. if ( nRemove )
  2296. {
  2297. node.nStart -= nRemove;
  2298. if ( nName )
  2299. nName -= nRemove;
  2300. else if ( nNameLen )
  2301. {
  2302. m_nL -= nRemove;
  2303. m_nR -= nRemove;
  2304. }
  2305. m_nNext -= nRemove;
  2306. }
  2307. int nNewOffset = node.nStart + nNodeLength;
  2308. MCD_STR& str = *m_pReaderFilePos->m_pstrBuffer;
  2309. m_pDocText = MCD_2PCSZ( str );
  2310. pD = &m_pDocText[nNewOffset];
  2311. cD = (unsigned int)*pD; // loaded char replaces null terminator
  2312. }
  2313. if (nRemovedAlready) // preserve m_nReadBufferRemoved for caller of ParseNode
  2314. m_pReaderFilePos->m_nReadBufferRemoved = nRemovedAlready;
  2315. }
  // Still at a terminator: the document really ends here
  2316. if ( ! cD )
  2317. {
  2318. if ( m_nNext == node.nStart )
  2319. {
  2320. node.nLength = 0;
  2321. node.nNodeType = 0;
  2322. return -2; // end of document
  2323. }
  // Anything other than trailing text or whitespace was cut off mid-node:
  // report it as unterminated, naming the kind of node for the result
  2324. if ( nNodeType != CMarkup::MNT_WHITESPACE && nNodeType != CMarkup::MNT_TEXT )
  2325. {
  2326. MCD_PCSZ pType = MCD_T("tag");
  2327. if ( (nParseFlags & PD_DOCTYPE) || nNodeType == CMarkup::MNT_DOCUMENT_TYPE )
  2328. pType = MCD_T("document_type");
  2329. else if ( nNodeType == CMarkup::MNT_ELEMENT )
  2330. pType = MCD_T("start_tag");
  2331. else if ( nNodeType == 0 )
  2332. pType = MCD_T("end_tag");
  2333. else if ( nNodeType == CMarkup::MNT_CDATA_SECTION )
  2334. pType = MCD_T("cdata_section");
  2335. else if ( nNodeType == CMarkup::MNT_PROCESSING_INSTRUCTION )
  2336. pType = MCD_T("processing_instruction");
  2337. else if ( nNodeType == CMarkup::MNT_COMMENT )
  2338. pType = MCD_T("comment");
  2339. nNodeType = -1;
  2340. x_AddResult(node.strMeta,MCD_T("unterminated_tag_syntax"),pType,MRC_TYPE,node.nStart);
  2341. }
  2342. break;
  2343. }
  2344. }
  // A tag name is in progress: keep advancing until a name-ending char,
  // then record the name location in m_nL/m_nR
  2345. if ( nName )
  2346. {
  2347. if ( x_ISENDNAME(cD) )
  2348. {
  2349. nNameLen = (int)(pD - m_pDocText) - nName;
  2350. m_nL = nName;
  2351. m_nR = nName + nNameLen - 1;
  2352. nName = 0;
  2353. cDminus2 = 0;
  2354. cDminus1 = 0;
  2355. }
  2356. else
  2357. {
  2358. pD += MCD_CLEN( pD );
  2359. continue;
  2360. }
  2361. }
  // Node type is known: look for its terminator
  2362. if ( pFindEnd )
  2363. {
  2364. if ( cD == '>' && ! (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D)) )
  2365. {
  2366. m_nNext = (int)(pD - m_pDocText) + 1;
  2367. if ( nEndLen == 1 )
  2368. {
  2369. pFindEnd = NULL;
  // A '/' right before '>' marks an empty element; the extra conditions
  // presumably exclude a '/' that belongs to an unquoted attribute value
  2370. if ( nNodeType == CMarkup::MNT_ELEMENT && cDminus1 == '/' )
  2371. {
  2372. if ( (! cDminus2) || (!(nParseFlags&PD_NOQUOTEVAL)) || x_ISNOTSECONDLASTINVAL(cDminus2) )
  2373. node.nNodeFlags |= MNF_EMPTY;
  2374. }
  2375. }
  2376. else if ( m_nNext - 1 > nEndLen )
  2377. {
  2378. // Test for end of PI or comment
  // Compares the chars leading up to this '>' with the start of pFindEnd
  2379. MCD_PCSZ pEnd = pD - nEndLen + 1;
  2380. MCD_PCSZ pInFindEnd = pFindEnd;
  2381. int nLen = nEndLen;
  2382. while ( --nLen && *pEnd++ == *pInFindEnd++ );
  2383. if ( nLen == 0 )
  2384. pFindEnd = NULL;
  2385. }
  2386. nParseFlags &= ~PD_NOQUOTEVAL; // make sure PD_NOQUOTEVAL is off
  // Inside a DOCTYPE the loop carries on after a nested declaration ends
  2387. if ( ! pFindEnd && ! (nParseFlags & PD_DOCTYPE) )
  2388. break;
  2389. }
  // Text (or a bad node) ends just before the next '<'
  2390. else if ( cD == '<' && (nNodeType == CMarkup::MNT_TEXT || nNodeType == -1) )
  2391. {
  2392. m_nNext = (int)(pD - m_pDocText);
  2393. break;
  2394. }
  // Inside a start tag: track attribute quoting so that a '>' inside a
  // quoted value does not end the tag
  2395. else if ( nNodeType & CMarkup::MNT_ELEMENT )
  2396. {
  2397. if ( (nParseFlags & (PD_INQUOTE_S|PD_INQUOTE_D|PD_NOQUOTEVAL)) )
  2398. {
  2399. if ( cD == '\"' && (nParseFlags&PD_INQUOTE_D) )
  2400. nParseFlags ^= PD_INQUOTE_D; // off
  2401. else if ( cD == '\'' && (nParseFlags&PD_INQUOTE_S) )
  2402. nParseFlags ^= PD_INQUOTE_S; // off
  2403. else if ( (nParseFlags&PD_NOQUOTEVAL) && x_ISWHITESPACE(cD) )
  2404. nParseFlags ^= PD_NOQUOTEVAL; // off
  2405. }
  2406. else // not in attrib value
  2407. {
  2408. // Only set INQUOTE status when preceded by equal sign
  2409. if ( cD == '\"' && (nParseFlags&PD_EQUALS) )
  2410. nParseFlags ^= PD_INQUOTE_D|PD_EQUALS; // D on, equals off
  2411. else if ( cD == '\'' && (nParseFlags&PD_EQUALS) )
  2412. nParseFlags ^= PD_INQUOTE_S|PD_EQUALS; // S on, equals off
  2413. else if ( cD == '=' && cDminus1 != '=' && ! (nParseFlags&PD_EQUALS) )
  2414. nParseFlags ^= PD_EQUALS; // on
  2415. else if ( (nParseFlags&PD_EQUALS) && ! x_ISWHITESPACE(cD) )
  2416. nParseFlags ^= PD_NOQUOTEVAL|PD_EQUALS; // no quote val on, equals off
  2417. }
  2418. cDminus2 = cDminus1;
  2419. cDminus1 = cD;
  2420. }
  // Inside a DTD declaration: quotes simply toggle
  2421. else if ( nNodeType & CMarkup::MNT_DOCUMENT_TYPE )
  2422. {
  2423. if ( cD == '\"' && ! (nParseFlags&PD_INQUOTE_S) )
  2424. nParseFlags ^= PD_INQUOTE_D; // toggle
  2425. else if ( cD == '\'' && ! (nParseFlags&PD_INQUOTE_D) )
  2426. nParseFlags ^= PD_INQUOTE_S; // toggle
  2427. }
  2428. }
  // Node type not decided yet: interpret this char by the pending state bit
  2429. else if ( nParseFlags )
  2430. {
  // Leading whitespace: whitespace node if '<' comes next, otherwise text
  2431. if ( nParseFlags & PD_TEXTORWS )
  2432. {
  2433. if ( cD == '<' )
  2434. {
  2435. m_nNext = (int)(pD - m_pDocText);
  2436. nNodeType = CMarkup::MNT_WHITESPACE;
  2437. break;
  2438. }
  2439. else if ( ! x_ISWHITESPACE(cD) )
  2440. {
  2441. nParseFlags ^= PD_TEXTORWS;
  2442. FINDNODETYPE( MCD_T("<"), CMarkup::MNT_TEXT )
  2443. }
  2444. }
  // Char after '<': name start char, '/', '!' or '?' picks the kind of tag
  2445. else if ( nParseFlags & PD_OPENTAG )
  2446. {
  2447. nParseFlags ^= PD_OPENTAG;
  2448. if ( cD > 0x60 || ( cD > 0x40 && cD < 0x5b ) || cD == 0x5f || cD == 0x3a )
  2449. FINDNODETYPENAME( MCD_T(">"), CMarkup::MNT_ELEMENT, 0 )
  2450. else if ( cD == '/' )
  2451. FINDNODETYPENAME( MCD_T(">"), 0, 1 )
  2452. else if ( cD == '!' )
  2453. nParseFlags |= PD_BANG;
  2454. else if ( cD == '?' )
  2455. FINDNODETYPENAME( MCD_T("?>"), CMarkup::MNT_PROCESSING_INSTRUCTION, 1 )
  2456. else
  2457. FINDNODEBAD( MCD_T("first_tag_syntax") )
  2458. }
  // Char after "<!"
  2459. else if ( nParseFlags & PD_BANG )
  2460. {
  2461. nParseFlags ^= PD_BANG;
  2462. if ( cD == '-' )
  2463. nParseFlags |= PD_DASH;
  2464. else if ( nParseFlags & PD_DOCTYPE )
  2465. {
  2466. if ( x_ISDOCTYPESTART(cD) ) // <!ELEMENT ATTLIST ENTITY NOTATION
  2467. FINDNODETYPE( MCD_T(">"), CMarkup::MNT_DOCUMENT_TYPE )
  2468. else
  2469. FINDNODEBAD( MCD_T("doctype_tag_syntax") )
  2470. }
  2471. else
  2472. {
  2473. if ( cD == '[' )
  2474. nParseFlags |= PD_BRACKET;
  2475. else if ( cD == 'D' )
  2476. nParseFlags |= PD_DOCTYPE;
  2477. else
  2478. FINDNODEBAD( MCD_T("exclamation_tag_syntax") )
  2479. }
  2480. }
  // Char after "<!-"
  2481. else if ( nParseFlags & PD_DASH )
  2482. {
  2483. nParseFlags ^= PD_DASH;
  2484. if ( cD == '-' )
  2485. FINDNODETYPE( MCD_T("-->"), CMarkup::MNT_COMMENT )
  2486. else
  2487. FINDNODEBAD( MCD_T("comment_tag_syntax") )
  2488. }
  // Char after "<![" (only the 'C' is checked here)
  2489. else if ( nParseFlags & PD_BRACKET )
  2490. {
  2491. nParseFlags ^= PD_BRACKET;
  2492. if ( cD == 'C' )
  2493. FINDNODETYPE( MCD_T("]]>"), CMarkup::MNT_CDATA_SECTION )
  2494. else
  2495. FINDNODEBAD( MCD_T("cdata_section_syntax") )
  2496. }
  // Inside a DOCTYPE, between declarations: '<' opens a nested declaration
  // and '>' closes the DOCTYPE itself
  2497. else if ( nParseFlags & PD_DOCTYPE )
  2498. {
  2499. if ( cD == '<' )
  2500. nParseFlags |= PD_OPENTAG;
  2501. else if ( cD == '>' )
  2502. {
  2503. m_nNext = (int)(pD - m_pDocText) + 1;
  2504. nNodeType = CMarkup::MNT_DOCUMENT_TYPE;
  2505. break;
  2506. }
  2507. }
  2508. }
  // First char of the node: '<' starts a tag, anything else is text or whitespace
  2509. else if ( cD == '<' )
  2510. {
  2511. nParseFlags |= PD_OPENTAG;
  2512. }
  2513. else
  2514. {
  2515. nNodeType = CMarkup::MNT_WHITESPACE;
  2516. if ( x_ISWHITESPACE(cD) )
  2517. nParseFlags |= PD_TEXTORWS;
  2518. else
  2519. FINDNODETYPE( MCD_T("<"), CMarkup::MNT_TEXT )
  2520. }
  // Step to the next character (MCD_CLEN presumably gives the length of the one at pD)
  2521. pD += MCD_CLEN( pD );
  2522. }
  2523. node.nLength = m_nNext - node.nStart;
  2524. node.nNodeType = nNodeType;
  2525. return nNodeType;
  2526. }
  2527. //////////////////////////////////////////////////////////////////////
  2528. // CMarkup public methods
  2529. //
  2530. CMarkup::~CMarkup()
  2531. {
  2532. if ( m_pSavedPosMaps )
  2533. {
  2534. delete m_pSavedPosMaps;
  2535. m_pSavedPosMaps = NULL;
  2536. }
  2537. if ( m_pElemPosTree )
  2538. {
  2539. delete m_pElemPosTree;
  2540. m_pElemPosTree = NULL;
  2541. }
  2542. //_CrtDumpMemoryLeaks();
  2543. }
  2544. void CMarkup::operator=( const CMarkup& markup )
  2545. {
  2546. // Copying not supported during file mode because of file pointer
  2547. if ( (m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) || (markup.m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) )
  2548. return;
  2549. m_iPosParent = markup.m_iPosParent;
  2550. m_iPos = markup.m_iPos;
  2551. m_iPosChild = markup.m_iPosChild;
  2552. m_iPosFree = markup.m_iPosFree;
  2553. m_iPosDeleted = markup.m_iPosDeleted;
  2554. m_nNodeType = markup.m_nNodeType;
  2555. m_nNodeOffset = markup.m_nNodeOffset;
  2556. m_nNodeLength = markup.m_nNodeLength;
  2557. m_strDoc = markup.m_strDoc;
  2558. m_strResult = markup.m_strResult;
  2559. m_nDocFlags = markup.m_nDocFlags;
  2560. m_pElemPosTree->CopyElemPosTree( markup.m_pElemPosTree, m_iPosFree );
  2561. m_pSavedPosMaps->CopySavedPosMaps( markup.m_pSavedPosMaps );
  2562. MARKUP_SETDEBUGSTATE;
  2563. }
  2564. bool CMarkup::SetDoc( MCD_PCSZ pDoc )
  2565. {
  2566. // pDoc is markup text, not a filename!
  2567. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2568. return false;
  2569. // Set document text
  2570. if ( pDoc )
  2571. m_strDoc = pDoc;
  2572. else
  2573. {
  2574. MCD_STRCLEARSIZE( m_strDoc );
  2575. m_pElemPosTree->ReleaseElemPosTree();
  2576. }
  2577. MCD_STRCLEAR(m_strResult);
  2578. return x_ParseDoc();
  2579. }
  2580. bool CMarkup::SetDoc( const MCD_STR& strDoc )
  2581. {
  2582. // strDoc is markup text, not a filename!
  2583. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2584. return false;
  2585. m_strDoc = strDoc;
  2586. MCD_STRCLEAR(m_strResult);
  2587. return x_ParseDoc();
  2588. }
  2589. bool CMarkup::IsWellFormed()
  2590. {
  2591. if ( m_nDocFlags & MDF_WRITEFILE )
  2592. return true;
  2593. if ( m_nDocFlags & MDF_READFILE )
  2594. {
  2595. if ( ! (ELEM(0).nFlags & MNF_ILLFORMED) )
  2596. return true;
  2597. }
  2598. else if ( m_pElemPosTree->GetSize()
  2599. && ! (ELEM(0).nFlags & MNF_ILLFORMED)
  2600. && ELEM(0).iElemChild
  2601. && ! ELEM(ELEM(0).iElemChild).iElemNext )
  2602. return true;
  2603. return false;
  2604. }
  2605. MCD_STR CMarkup::GetError() const
  2606. {
  2607. // For backwards compatibility, return a readable English string built from m_strResult
  2608. // In release 11.0 you can use GetResult and examine result in XML format
  2609. CMarkup mResult( m_strResult );
  2610. MCD_STR strError;
  2611. int nSyntaxErrors = 0;
  2612. while ( mResult.FindElem() )
  2613. {
  2614. MCD_STR strItem;
  2615. MCD_STR strID = mResult.GetTagName();
  2616. // Parse result
  2617. if ( strID == MCD_T("root_has_sibling") )
  2618. strItem = MCD_T("root element has sibling");
  2619. else if ( strID == MCD_T("no_root_element") )
  2620. strItem = MCD_T("no root element");
  2621. else if ( strID == MCD_T("lone_end_tag") )
  2622. strItem = MCD_T("lone end tag '") + mResult.GetAttrib(MCD_T("tagname")) + MCD_T("' at offset ")
  2623. + mResult.GetAttrib(MCD_T("offset"));
  2624. else if ( strID == MCD_T("unended_start_tag") )
  2625. strItem = MCD_T("start tag '") + mResult.GetAttrib(MCD_T("tagname")) + MCD_T("' at offset ")
  2626. + mResult.GetAttrib(MCD_T("offset")) + MCD_T(" expecting end tag at offset ") + mResult.GetAttrib(MCD_T("offset2"));
  2627. else if ( strID == MCD_T("first_tag_syntax") )
  2628. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2629. + MCD_T(" expecting tag name / ! or ?");
  2630. else if ( strID == MCD_T("exclamation_tag_syntax") )
  2631. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2632. + MCD_T(" expecting 'DOCTYPE' [ or -");
  2633. else if ( strID == MCD_T("doctype_tag_syntax") )
  2634. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2635. + MCD_T(" expecting markup declaration"); // ELEMENT ATTLIST ENTITY NOTATION
  2636. else if ( strID == MCD_T("comment_tag_syntax") )
  2637. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2638. + MCD_T(" expecting - to begin comment");
  2639. else if ( strID == MCD_T("cdata_section_syntax") )
  2640. strItem = MCD_T("tag syntax error at offset ") + mResult.GetAttrib(MCD_T("offset"))
  2641. + MCD_T(" expecting 'CDATA'");
  2642. else if ( strID == MCD_T("unterminated_tag_syntax") )
  2643. strItem = MCD_T("unterminated tag at offset ") + mResult.GetAttrib(MCD_T("offset"));
  2644. // Report only the first syntax or well-formedness error
  2645. if ( ! MCD_STRISEMPTY(strItem) )
  2646. {
  2647. ++nSyntaxErrors;
  2648. if ( nSyntaxErrors > 1 )
  2649. continue;
  2650. }
  2651. // I/O results
  2652. if ( strID == MCD_T("file_error") )
  2653. strItem = mResult.GetAttrib(MCD_T("msg"));
  2654. else if ( strID == MCD_T("bom") )
  2655. strItem = MCD_T("BOM +");
  2656. else if ( strID == MCD_T("read") || strID == MCD_T("write") || strID == MCD_T("converted_to") || strID == MCD_T("converted_from") )
  2657. {
  2658. if ( strID == MCD_T("converted_to") )
  2659. strItem = MCD_T("to ");
  2660. MCD_STR strEncoding = mResult.GetAttrib( MCD_T("encoding") );
  2661. if ( ! MCD_STRISEMPTY(strEncoding) )
  2662. strItem += strEncoding + MCD_T(" ");
  2663. strItem += MCD_T("length ") + mResult.GetAttrib(MCD_T("length"));
  2664. if ( strID == MCD_T("converted_from") )
  2665. strItem += MCD_T(" to");
  2666. }
  2667. else if ( strID == MCD_T("nulls_removed") )
  2668. strItem = MCD_T("removed ") + mResult.GetAttrib(MCD_T("count")) + MCD_T(" nulls");
  2669. else if ( strID == MCD_T("conversion_loss") )
  2670. strItem = MCD_T("(chars lost in conversion!)");
  2671. else if ( strID == MCD_T("utf8_detection") )
  2672. strItem = MCD_T("(used UTF-8 detection)");
  2673. else if ( strID == MCD_T("endian_swap") )
  2674. strItem = MCD_T("endian swap");
  2675. else if ( strID == MCD_T("truncation_error") )
  2676. strItem = MCD_T("encoding ") + mResult.GetAttrib(MCD_T("encoding")) + MCD_T(" adjustment error");
  2677. // Concatenate result item to error string
  2678. if ( ! MCD_STRISEMPTY(strItem) )
  2679. {
  2680. if ( ! MCD_STRISEMPTY(strError) )
  2681. strError += MCD_T(" ");
  2682. strError += strItem;
  2683. }
  2684. }
  2685. return strError;
  2686. }
  2687. bool CMarkup::Load( MCD_CSTR_FILENAME szFileName )
  2688. {
  2689. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2690. return false;
  2691. if ( ! ReadTextFile(szFileName, m_strDoc, &m_strResult, &m_nDocFlags) )
  2692. return false;
  2693. return x_ParseDoc();
  2694. }
  2695. bool CMarkup::ReadTextFile( MCD_CSTR_FILENAME szFileName, MCD_STR& strDoc, MCD_STR* pstrResult, int* pnDocFlags, MCD_STR* pstrEncoding )
  2696. {
  2697. // Static utility method to load text file into strDoc
  2698. //
  2699. FilePos file;
  2700. file.m_nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_READFILE;
  2701. bool bSuccess = file.FileOpen( szFileName );
  2702. if ( pstrResult )
  2703. *pstrResult = file.m_strIOResult;
  2704. MCD_STRCLEAR(strDoc);
  2705. if ( bSuccess )
  2706. {
  2707. file.FileSpecifyEncoding( pstrEncoding );
  2708. file.m_nOpFileByteLen = (int)((MCD_INTFILEOFFSET)(file.m_nFileByteLen - file.m_nFileByteOffset));
  2709. bSuccess = file.FileReadText( strDoc );
  2710. file.FileClose();
  2711. if ( pstrResult )
  2712. *pstrResult += file.m_strIOResult;
  2713. if ( pnDocFlags )
  2714. *pnDocFlags = file.m_nDocFlags;
  2715. }
  2716. return bSuccess;
  2717. }
  2718. bool CMarkup::Save( MCD_CSTR_FILENAME szFileName )
  2719. {
  2720. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2721. return false;
  2722. return WriteTextFile( szFileName, m_strDoc, &m_strResult, &m_nDocFlags );
  2723. }
  2724. bool CMarkup::WriteTextFile( MCD_CSTR_FILENAME szFileName, const MCD_STR& strDoc, MCD_STR* pstrResult, int* pnDocFlags, MCD_STR* pstrEncoding )
  2725. {
  2726. // Static utility method to save strDoc to text file
  2727. //
  2728. FilePos file;
  2729. file.m_nDocFlags = (pnDocFlags?*pnDocFlags:0) | MDF_WRITEFILE;
  2730. bool bSuccess = file.FileOpen( szFileName );
  2731. if ( pstrResult )
  2732. *pstrResult = file.m_strIOResult;
  2733. if ( bSuccess )
  2734. {
  2735. if ( MCD_STRISEMPTY(file.m_strEncoding) && ! MCD_STRISEMPTY(strDoc) )
  2736. {
  2737. file.m_strEncoding = GetDeclaredEncoding( strDoc );
  2738. if ( MCD_STRISEMPTY(file.m_strEncoding) )
  2739. file.m_strEncoding = MCD_T("UTF-8"); // to do: MDF_ANSIFILE
  2740. }
  2741. file.FileSpecifyEncoding( pstrEncoding );
  2742. bSuccess = file.FileWriteText( strDoc );
  2743. file.FileClose();
  2744. if ( pstrResult )
  2745. *pstrResult += file.m_strIOResult;
  2746. if ( pnDocFlags )
  2747. *pnDocFlags = file.m_nDocFlags;
  2748. }
  2749. return bSuccess;
  2750. }
  2751. bool CMarkup::FindElem( MCD_CSTR szName )
  2752. {
  2753. if ( m_nDocFlags & MDF_WRITEFILE )
  2754. return false;
  2755. if ( m_pElemPosTree->GetSize() )
  2756. {
  2757. // Change current position only if found
  2758. PathPos path( szName, false );
  2759. int iPos = x_FindElem( m_iPosParent, m_iPos, path );
  2760. if ( iPos )
  2761. {
  2762. // Assign new position
  2763. x_SetPos( ELEM(iPos).iElemParent, iPos, 0 );
  2764. return true;
  2765. }
  2766. }
  2767. return false;
  2768. }
  2769. bool CMarkup::FindChildElem( MCD_CSTR szName )
  2770. {
  2771. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  2772. return false;
  2773. // Shorthand: if no current main position, find first child under parent element
  2774. if ( ! m_iPos )
  2775. FindElem();
  2776. // Change current child position only if found
  2777. PathPos path( szName, false );
  2778. int iPosChild = x_FindElem( m_iPos, m_iPosChild, path );
  2779. if ( iPosChild )
  2780. {
  2781. // Assign new position
  2782. int iPos = ELEM(iPosChild).iElemParent;
  2783. x_SetPos( ELEM(iPos).iElemParent, iPos, iPosChild );
  2784. return true;
  2785. }
  2786. return false;
  2787. }
  2788. MCD_STR CMarkup::EscapeText( MCD_CSTR szText, int nFlags )
  2789. {
  2790. // Convert text as seen outside XML document to XML friendly
  2791. // replacing special characters with ampersand escape codes
  2792. // E.g. convert "6>7" to "6&gt;7"
  2793. //
  2794. // &lt; less than
  2795. // &amp; ampersand
  2796. // &gt; greater than
  2797. //
  2798. // and for attributes:
  2799. //
  2800. // &apos; apostrophe or single quote
  2801. // &quot; double quote
  2802. //
  2803. static MCD_PCSZ apReplace[] = { NULL,MCD_T("&lt;"),MCD_T("&amp;"),MCD_T("&gt;"),MCD_T("&quot;"),MCD_T("&apos;") };
  2804. MCD_STR strText;
  2805. MCD_PCSZ pSource = szText;
  2806. int nDestSize = MCD_PSZLEN(pSource);
  2807. nDestSize += nDestSize / 10 + 7;
  2808. MCD_BLDRESERVE(strText,nDestSize);
  2809. MCD_CHAR cSource = *pSource;
  2810. int nFound;
  2811. int nCharLen;
  2812. while ( cSource )
  2813. {
  2814. MCD_BLDCHECK(strText,nDestSize,6);
  2815. nFound = ((nFlags&MNF_ESCAPEQUOTES)?x_ISATTRIBSPECIAL(cSource):x_ISSPECIAL(cSource));
  2816. if ( nFound )
  2817. {
  2818. bool bIgnoreAmpersand = false;
  2819. if ( (nFlags&MNF_WITHREFS) && cSource == '&' )
  2820. {
  2821. // Do not replace ampersand if it is start of any entity reference
  2822. // &[#_:A-Za-zU][_:-.A-Za-z0-9U]*; where U is > 0x7f
  2823. MCD_PCSZ pCheckEntity = pSource;
  2824. ++pCheckEntity;
  2825. MCD_CHAR c = *pCheckEntity;
  2826. if ( x_ISSTARTENTREF(c) || ((unsigned int)c)>0x7f )
  2827. {
  2828. while ( 1 )
  2829. {
  2830. pCheckEntity += MCD_CLEN( pCheckEntity );
  2831. c = *pCheckEntity;
  2832. if ( c == ';' )
  2833. {
  2834. int nEntityLen = (int)(pCheckEntity - pSource) + 1;
  2835. MCD_BLDAPPENDN(strText,pSource,nEntityLen);
  2836. pSource = pCheckEntity;
  2837. bIgnoreAmpersand = true;
  2838. }
  2839. else if ( x_ISINENTREF(c) || ((unsigned int)c)>0x7f )
  2840. continue;
  2841. break;
  2842. }
  2843. }
  2844. }
  2845. if ( ! bIgnoreAmpersand )
  2846. {
  2847. MCD_BLDAPPEND(strText,apReplace[nFound]);
  2848. }
  2849. ++pSource; // ASCII, so 1 byte
  2850. }
  2851. else
  2852. {
  2853. nCharLen = MCD_CLEN( pSource );
  2854. MCD_BLDAPPENDN(strText,pSource,nCharLen);
  2855. pSource += nCharLen;
  2856. }
  2857. cSource = *pSource;
  2858. }
  2859. MCD_BLDRELEASE(strText);
  2860. return strText;
  2861. }
  2862. // Predefined character entities
  2863. // By default UnescapeText will decode standard HTML entities as well as the 5 in XML
  2864. // To unescape only the 5 standard XML entities, use this short table instead:
  2865. // MCD_PCSZ PredefEntityTable[4] =
  2866. // { MCD_T("20060lt"),MCD_T("40034quot"),MCD_T("30038amp"),MCD_T("20062gt40039apos") };
  2867. //
  2868. // This is a precompiled ASCII hash table for speed and minimum memory requirement
  2869. // Each entry consists of a 1 digit code name length, 4 digit code point, and the code name
  2870. // Each table slot can have multiple entries, table size 130 was chosen for even distribution
  2871. //
  2872. MCD_PCSZ PredefEntityTable[130] =
  2873. {
  2874. MCD_T("60216oslash60217ugrave60248oslash60249ugrave"),
  2875. MCD_T("50937omega60221yacute58968lceil50969omega60253yacute"),
  2876. MCD_T("50916delta50206icirc50948delta50238icirc68472weierp"),MCD_T("40185sup1"),
  2877. MCD_T("68970lfloor40178sup2"),
  2878. MCD_T("50922kappa60164curren50954kappa58212mdash40179sup3"),
  2879. MCD_T("59830diams58211ndash"),MCD_T("68855otimes58969rceil"),
  2880. MCD_T("50338oelig50212ocirc50244ocirc50339oelig58482trade"),
  2881. MCD_T("50197aring50931sigma50229aring50963sigma"),
  2882. MCD_T("50180acute68971rfloor50732tilde"),MCD_T("68249lsaquo"),
  2883. MCD_T("58734infin68201thinsp"),MCD_T("50161iexcl"),
  2884. MCD_T("50920theta50219ucirc50952theta50251ucirc"),MCD_T("58254oline"),
  2885. MCD_T("58260frasl68727lowast"),MCD_T("59827clubs60191iquest68250rsaquo"),
  2886. MCD_T("58629crarr50181micro"),MCD_T("58222bdquo"),MCD_T(""),
  2887. MCD_T("58243prime60177plusmn58242prime"),MCD_T("40914beta40946beta"),MCD_T(""),
  2888. MCD_T(""),MCD_T(""),MCD_T("50171laquo50215times"),MCD_T("40710circ"),
  2889. MCD_T("49001lang"),MCD_T("58220ldquo40175macr"),
  2890. MCD_T("40182para50163pound48476real"),MCD_T(""),MCD_T("58713notin50187raquo"),
  2891. MCD_T("48773cong50223szlig50978upsih"),
  2892. MCD_T("58776asymp58801equiv49002rang58218sbquo"),
  2893. MCD_T("50222thorn48659darr48595darr40402fnof58221rdquo50254thorn"),
  2894. MCD_T("40162cent58722minus"),MCD_T("58707exist40170ordf"),MCD_T(""),
  2895. MCD_T("40921iota58709empty48660harr48596harr40953iota"),MCD_T(""),
  2896. MCD_T("40196auml40228auml48226bull40167sect48838sube"),MCD_T(""),
  2897. MCD_T("48656larr48592larr58853oplus"),MCD_T("30176deg58216lsquo40186ordm"),
  2898. MCD_T("40203euml40039apos40235euml48712isin40160nbsp"),
  2899. MCD_T("40918zeta40950zeta"),MCD_T("38743and48195emsp48719prod"),
  2900. MCD_T("30935chi38745cap30967chi48194ensp"),
  2901. MCD_T("40207iuml40239iuml48706part48869perp48658rarr48594rarr"),
  2902. MCD_T("38736ang48836nsub58217rsquo"),MCD_T(""),
  2903. MCD_T("48901sdot48657uarr48593uarr"),MCD_T("40169copy48364euro"),
  2904. MCD_T("30919eta30951eta"),MCD_T("40214ouml40246ouml48839supe"),MCD_T(""),
  2905. MCD_T(""),MCD_T("30038amp30174reg"),MCD_T("48733prop"),MCD_T(""),
  2906. MCD_T("30208eth30934phi40220uuml30240eth30966phi40252uuml"),MCD_T(""),MCD_T(""),
  2907. MCD_T(""),MCD_T("40376yuml40255yuml"),MCD_T(""),MCD_T("40034quot48204zwnj"),
  2908. MCD_T("38746cup68756there4"),MCD_T("30929rho30961rho38764sim"),
  2909. MCD_T("30932tau38834sub30964tau"),MCD_T("38747int38206lrm38207rlm"),
  2910. MCD_T("30936psi30968psi30165yen"),MCD_T(""),MCD_T("28805ge30168uml"),
  2911. MCD_T("30982piv"),MCD_T(""),MCD_T("30172not"),MCD_T(""),MCD_T("28804le"),
  2912. MCD_T("30173shy"),MCD_T("39674loz28800ne38721sum"),MCD_T(""),MCD_T(""),
  2913. MCD_T("38835sup"),MCD_T("28715ni"),MCD_T(""),MCD_T("20928pi20960pi38205zwj"),
  2914. MCD_T(""),MCD_T("60923lambda20062gt60955lambda"),MCD_T(""),MCD_T(""),
  2915. MCD_T("60199ccedil60231ccedil"),MCD_T(""),MCD_T("20060lt"),
  2916. MCD_T("20926xi28744or20958xi"),MCD_T("20924mu20956mu"),MCD_T("20925nu20957nu"),
  2917. MCD_T("68225dagger68224dagger"),MCD_T("80977thetasym"),MCD_T(""),MCD_T(""),
  2918. MCD_T(""),MCD_T("78501alefsym"),MCD_T(""),MCD_T(""),MCD_T(""),
  2919. MCD_T("60193aacute60195atilde60225aacute60227atilde"),MCD_T(""),
  2920. MCD_T("70927omicron60247divide70959omicron"),MCD_T("60192agrave60224agrave"),
  2921. MCD_T("60201eacute60233eacute60962sigmaf"),MCD_T("70917epsilon70949epsilon"),
  2922. MCD_T(""),MCD_T("60200egrave60232egrave"),MCD_T("60205iacute60237iacute"),
  2923. MCD_T(""),MCD_T(""),MCD_T("60204igrave68230hellip60236igrave"),
  2924. MCD_T("60166brvbar"),
  2925. MCD_T("60209ntilde68704forall58711nabla60241ntilde69824spades"),
  2926. MCD_T("60211oacute60213otilde60189frac1260183middot60243oacute60245otilde"),
  2927. MCD_T(""),MCD_T("50184cedil60188frac14"),
  2928. MCD_T("50198aelig50194acirc60210ograve50226acirc50230aelig60242ograve"),
  2929. MCD_T("50915gamma60190frac3450947gamma58465image58730radic"),
  2930. MCD_T("60352scaron60353scaron"),MCD_T("60218uacute69829hearts60250uacute"),
  2931. MCD_T("50913alpha50202ecirc70933upsilon50945alpha50234ecirc70965upsilon"),
  2932. MCD_T("68240permil")
  2933. };
  2934. MCD_STR CMarkup::UnescapeText( MCD_CSTR szText, int nTextLength /*=-1*/, int nFlags /*=0*/ )
  2935. {
  2936. // Convert XML friendly text to text as seen outside XML document
  2937. // ampersand escape codes replaced with special characters e.g. convert "6&gt;7" to "6>7"
  2938. // ampersand numeric codes replaced with character e.g. convert &#60; to <
  2939. // Conveniently the result is always the same or shorter in byte length
  2940. //
  2941. MCD_STR strText;
  2942. MCD_PCSZ pSource = szText;
  2943. if ( nTextLength == -1 )
  2944. nTextLength = MCD_PSZLEN(szText);
  2945. MCD_BLDRESERVE(strText,nTextLength);
  2946. MCD_CHAR szCodeName[10];
  2947. bool bAlterWhitespace = (nFlags & (MDF_TRIMWHITESPACE|MDF_COLLAPSEWHITESPACE))?true:false;
  2948. bool bCollapseWhitespace = (nFlags & MDF_COLLAPSEWHITESPACE)?true:false;
  2949. int nCharWhitespace = -1; // start of string
  2950. int nCharLen;
  2951. int nChar = 0;
  2952. while ( nChar < nTextLength )
  2953. {
  2954. if ( pSource[nChar] == '&' )
  2955. {
  2956. if ( bAlterWhitespace )
  2957. nCharWhitespace = 0;
  2958. // Get corresponding unicode code point
  2959. int nUnicode = 0;
  2960. // Look for terminating semi-colon within 9 ASCII characters
  2961. int nCodeLen = 0;
  2962. MCD_CHAR cCodeChar = pSource[nChar+1];
  2963. while ( nCodeLen < 9 && ((unsigned int)cCodeChar) < 128 && cCodeChar != ';' )
  2964. {
  2965. if ( cCodeChar >= 'A' && cCodeChar <= 'Z') // upper case?
  2966. cCodeChar += ('a' - 'A'); // make lower case
  2967. szCodeName[nCodeLen] = cCodeChar;
  2968. ++nCodeLen;
  2969. cCodeChar = pSource[nChar+1+nCodeLen];
  2970. }
  2971. if ( cCodeChar == ';' ) // found semi-colon?
  2972. {
  2973. // Decode szCodeName
  2974. szCodeName[nCodeLen] = '\0';
  2975. if ( *szCodeName == '#' ) // numeric character reference?
  2976. {
  2977. // Is it a hex number?
  2978. int nBase = 10; // decimal
  2979. int nNumberOffset = 1; // after #
  2980. if ( szCodeName[1] == 'x' )
  2981. {
  2982. nNumberOffset = 2; // after #x
  2983. nBase = 16; // hex
  2984. }
  2985. nUnicode = MCD_PSZTOL( &szCodeName[nNumberOffset], NULL, nBase );
  2986. }
  2987. else // does not start with #
  2988. {
  2989. // Look for matching code name in PredefEntityTable
  2990. MCD_PCSZ pEntry = PredefEntityTable[x_Hash(szCodeName,sizeof(PredefEntityTable)/sizeof(MCD_PCSZ))];
  2991. while ( *pEntry )
  2992. {
  2993. // e.g. entry: 40039apos means length 4, code point 0039, code name apos
  2994. int nEntryLen = (*pEntry - '0');
  2995. ++pEntry;
  2996. MCD_PCSZ pCodePoint = pEntry;
  2997. pEntry += 4;
  2998. if ( nEntryLen == nCodeLen && x_StrNCmp(szCodeName,pEntry,nEntryLen) == 0 )
  2999. {
  3000. // Convert digits to integer up to code name which always starts with alpha
  3001. nUnicode = MCD_PSZTOL( pCodePoint, NULL, 10 );
  3002. break;
  3003. }
  3004. pEntry += nEntryLen;
  3005. }
  3006. }
  3007. }
  3008. // If a code point found, encode it into text
  3009. if ( nUnicode )
  3010. {
  3011. MCD_CHAR szChar[5];
  3012. nCharLen = 0;
  3013. #if defined(MARKUP_WCHAR) // WCHAR
  3014. #if MARKUP_SIZEOFWCHAR == 4 // sizeof(wchar_t) == 4
  3015. szChar[0] = (MCD_CHAR)nUnicode;
  3016. nCharLen = 1;
  3017. #else // sizeof(wchar_t) == 2
  3018. EncodeCharUTF16( nUnicode, (unsigned short*)szChar, nCharLen );
  3019. #endif
  3020. #elif defined(MARKUP_MBCS) // MBCS/double byte
  3021. #if defined(MARKUP_WINCONV)
  3022. int nUsedDefaultChar = 0;
  3023. wchar_t wszUTF16[2];
  3024. EncodeCharUTF16( nUnicode, (unsigned short*)wszUTF16, nCharLen );
  3025. nCharLen = WideCharToMultiByte( CP_ACP, 0, wszUTF16, nCharLen, szChar, 5, NULL, &nUsedDefaultChar );
  3026. if ( nUsedDefaultChar || nCharLen <= 0 )
  3027. nUnicode = 0;
  3028. #else // not WINCONV
  3029. wchar_t wcUnicode = (wchar_t)nUnicode;
  3030. nCharLen = wctomb( szChar, wcUnicode );
  3031. if ( nCharLen <= 0 )
  3032. nUnicode = 0;
  3033. #endif // not WINCONV
  3034. #else // not WCHAR and not MBCS/double byte
  3035. EncodeCharUTF8( nUnicode, szChar, nCharLen );
  3036. #endif // not WCHAR and not MBCS/double byte
  3037. // Increment index past ampersand semi-colon
  3038. if ( nUnicode ) // must check since MBCS case can clear it
  3039. {
  3040. MCD_BLDAPPENDN(strText,szChar,nCharLen);
  3041. nChar += nCodeLen + 2;
  3042. }
  3043. }
  3044. if ( ! nUnicode )
  3045. {
  3046. // If the code is not converted, leave it as is
  3047. MCD_BLDAPPEND1(strText,'&');
  3048. ++nChar;
  3049. }
  3050. }
  3051. else if ( bAlterWhitespace && x_ISWHITESPACE(pSource[nChar]) )
  3052. {
  3053. if ( nCharWhitespace == 0 && bCollapseWhitespace )
  3054. {
  3055. nCharWhitespace = MCD_BLDLEN(strText);
  3056. MCD_BLDAPPEND1(strText,' ');
  3057. }
  3058. else if ( nCharWhitespace != -1 && ! bCollapseWhitespace )
  3059. {
  3060. if ( nCharWhitespace == 0 )
  3061. nCharWhitespace = MCD_BLDLEN(strText);
  3062. MCD_BLDAPPEND1(strText,pSource[nChar]);
  3063. }
  3064. ++nChar;
  3065. }
  3066. else // not &
  3067. {
  3068. if ( bAlterWhitespace )
  3069. nCharWhitespace = 0;
  3070. nCharLen = MCD_CLEN(&pSource[nChar]);
  3071. MCD_BLDAPPENDN(strText,&pSource[nChar],nCharLen);
  3072. nChar += nCharLen;
  3073. }
  3074. }
  3075. if ( bAlterWhitespace && nCharWhitespace > 0 )
  3076. {
  3077. MCD_BLDTRUNC(strText,nCharWhitespace);
  3078. }
  3079. MCD_BLDRELEASE(strText);
  3080. return strText;
  3081. }
  3082. bool CMarkup::DetectUTF8( const char* pText, int nTextLen, int* pnNonASCII/*=NULL*/, bool* bErrorAtEnd/*=NULL*/ )
  3083. {
  3084. // return true if ASCII or all non-ASCII byte sequences are valid UTF-8 pattern:
  3085. // ASCII 0xxxxxxx
  3086. // 2-byte 110xxxxx 10xxxxxx
  3087. // 3-byte 1110xxxx 10xxxxxx 10xxxxxx
  3088. // 4-byte 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  3089. // *pnNonASCII is set (if pnNonASCII is not NULL) to the number of non-ASCII UTF-8 sequences
  3090. // or if an invalid UTF-8 sequence is found, to 1 + the valid non-ASCII sequences up to the invalid sequence
  3091. // *bErrorAtEnd is set (if bErrorAtEnd is not NULL) to true if the UTF-8 was cut off at the end in mid valid sequence
  3092. int nUChar;
  3093. if ( pnNonASCII )
  3094. *pnNonASCII = 0;
  3095. const char* pTextEnd = pText + nTextLen;
  3096. while ( *pText && pText != pTextEnd )
  3097. {
  3098. if ( (unsigned char)(*pText) & 0x80 )
  3099. {
  3100. if ( pnNonASCII )
  3101. ++(*pnNonASCII);
  3102. nUChar = DecodeCharUTF8( pText, pTextEnd );
  3103. if ( nUChar == -1 )
  3104. {
  3105. if ( bErrorAtEnd )
  3106. *bErrorAtEnd = (pTextEnd == pText)? true:false;
  3107. return false;
  3108. }
  3109. }
  3110. else
  3111. ++pText;
  3112. }
  3113. if ( bErrorAtEnd )
  3114. *bErrorAtEnd = false;
  3115. return true;
  3116. }
  3117. int CMarkup::DecodeCharUTF8( const char*& pszUTF8, const char* pszUTF8End/*=NULL*/ )
  3118. {
  3119. // Return Unicode code point and increment pszUTF8 past 1-4 bytes
  3120. // pszUTF8End can be NULL if pszUTF8 is null terminated
  3121. int nUChar = (unsigned char)*pszUTF8;
  3122. ++pszUTF8;
  3123. if ( nUChar & 0x80 )
  3124. {
  3125. int nExtraChars;
  3126. if ( ! (nUChar & 0x20) )
  3127. {
  3128. nExtraChars = 1;
  3129. nUChar &= 0x1f;
  3130. }
  3131. else if ( ! (nUChar & 0x10) )
  3132. {
  3133. nExtraChars = 2;
  3134. nUChar &= 0x0f;
  3135. }
  3136. else if ( ! (nUChar & 0x08) )
  3137. {
  3138. nExtraChars = 3;
  3139. nUChar &= 0x07;
  3140. }
  3141. else
  3142. return -1;
  3143. while ( nExtraChars-- )
  3144. {
  3145. if ( pszUTF8 == pszUTF8End || ! (*pszUTF8 & 0x80) )
  3146. return -1;
  3147. nUChar = nUChar<<6;
  3148. nUChar |= *pszUTF8 & 0x3f;
  3149. ++pszUTF8;
  3150. }
  3151. }
  3152. return nUChar;
  3153. }
  3154. void CMarkup::EncodeCharUTF16( int nUChar, unsigned short* pwszUTF16, int& nUTF16Len )
  3155. {
  3156. // Write UTF-16 sequence to pwszUTF16 for Unicode code point nUChar and update nUTF16Len
  3157. // Be sure pwszUTF16 has room for up to 2 wide chars
  3158. if ( nUChar & ~0xffff )
  3159. {
  3160. if ( pwszUTF16 )
  3161. {
  3162. // Surrogate pair
  3163. nUChar -= 0x10000;
  3164. pwszUTF16[nUTF16Len++] = (unsigned short)(((nUChar>>10) & 0x3ff) | 0xd800); // W1
  3165. pwszUTF16[nUTF16Len++] = (unsigned short)((nUChar & 0x3ff) | 0xdc00); // W2
  3166. }
  3167. else
  3168. nUTF16Len += 2;
  3169. }
  3170. else
  3171. {
  3172. if ( pwszUTF16 )
  3173. pwszUTF16[nUTF16Len++] = (unsigned short)nUChar;
  3174. else
  3175. ++nUTF16Len;
  3176. }
  3177. }
  3178. int CMarkup::DecodeCharUTF16( const unsigned short*& pwszUTF16, const unsigned short* pszUTF16End/*=NULL*/ )
  3179. {
  3180. // Return Unicode code point and increment pwszUTF16 past 1 or 2 (if surrogrates) UTF-16 code points
  3181. // pszUTF16End can be NULL if pszUTF16 is zero terminated
  3182. int nUChar = *pwszUTF16;
  3183. ++pwszUTF16;
  3184. if ( (nUChar & ~0x000007ff) == 0xd800 ) // W1
  3185. {
  3186. if ( pwszUTF16 == pszUTF16End || ! (*pwszUTF16) ) // W2
  3187. return -1; // incorrect UTF-16
  3188. nUChar = (((nUChar & 0x3ff) << 10) | (*pwszUTF16 & 0x3ff)) + 0x10000;
  3189. ++pwszUTF16;
  3190. }
  3191. return nUChar;
  3192. }
  3193. void CMarkup::EncodeCharUTF8( int nUChar, char* pszUTF8, int& nUTF8Len )
  3194. {
  3195. // Write UTF-8 sequence to pszUTF8 for Unicode code point nUChar and update nUTF8Len
  3196. // Be sure pszUTF8 has room for up to 4 bytes
  3197. if ( ! (nUChar & ~0x0000007f) ) // < 0x80
  3198. {
  3199. if ( pszUTF8 )
  3200. pszUTF8[nUTF8Len++] = (char)nUChar;
  3201. else
  3202. ++nUTF8Len;
  3203. }
  3204. else if ( ! (nUChar & ~0x000007ff) ) // < 0x800
  3205. {
  3206. if ( pszUTF8 )
  3207. {
  3208. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x7c0)>>6)|0xc0);
  3209. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3210. }
  3211. else
  3212. nUTF8Len += 2;
  3213. }
  3214. else if ( ! (nUChar & ~0x0000ffff) ) // < 0x10000
  3215. {
  3216. if ( pszUTF8 )
  3217. {
  3218. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xf000)>>12)|0xe0);
  3219. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80);
  3220. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3221. }
  3222. else
  3223. nUTF8Len += 3;
  3224. }
  3225. else // < 0x110000
  3226. {
  3227. if ( pszUTF8 )
  3228. {
  3229. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x1c0000)>>18)|0xf0);
  3230. pszUTF8[nUTF8Len++] = (char)(((nUChar&0x3f000)>>12)|0x80);
  3231. pszUTF8[nUTF8Len++] = (char)(((nUChar&0xfc0)>>6)|0x80);
  3232. pszUTF8[nUTF8Len++] = (char)((nUChar&0x3f)|0x80);
  3233. }
  3234. else
  3235. nUTF8Len += 4;
  3236. }
  3237. }
  3238. int CMarkup::UTF16To8( char* pszUTF8, const unsigned short* pwszUTF16, int nUTF8Count )
  3239. {
  3240. // Supports the same arguments as wcstombs
  3241. // the pwszUTF16 source must be a NULL-terminated UTF-16 string
  3242. // if pszUTF8 is NULL, the number of bytes required is returned and nUTF8Count is ignored
  3243. // otherwise pszUTF8 is filled with the result string and NULL-terminated if nUTF8Count allows
  3244. // nUTF8Count is the byte size of pszUTF8 and must be large enough for the NULL if NULL desired
  3245. // and the number of bytes (excluding NULL) is returned
  3246. //
  3247. int nUChar, nUTF8Len = 0;
  3248. while ( *pwszUTF16 )
  3249. {
  3250. // Decode UTF-16
  3251. nUChar = DecodeCharUTF16( pwszUTF16, NULL );
  3252. if ( nUChar == -1 )
  3253. nUChar = '?';
  3254. // Encode UTF-8
  3255. if ( pszUTF8 && nUTF8Len + 4 > nUTF8Count )
  3256. {
  3257. int nUTF8LenSoFar = nUTF8Len;
  3258. EncodeCharUTF8( nUChar, NULL, nUTF8Len );
  3259. if ( nUTF8Len > nUTF8Count )
  3260. return nUTF8LenSoFar;
  3261. nUTF8Len = nUTF8LenSoFar;
  3262. }
  3263. EncodeCharUTF8( nUChar, pszUTF8, nUTF8Len );
  3264. }
  3265. if ( pszUTF8 && nUTF8Len < nUTF8Count )
  3266. pszUTF8[nUTF8Len] = 0;
  3267. return nUTF8Len;
  3268. }
  3269. int CMarkup::UTF8To16( unsigned short* pwszUTF16, const char* pszUTF8, int nUTF8Count )
  3270. {
  3271. // Supports the same arguments as mbstowcs
  3272. // the pszUTF8 source must be a UTF-8 string which will be processed up to NULL-terminator or nUTF8Count
  3273. // if pwszUTF16 is NULL, the number of UTF-16 chars required is returned
  3274. // nUTF8Count is maximum UTF-8 bytes to convert and should include NULL if NULL desired in result
  3275. // if pwszUTF16 is not NULL it is filled with the result string and it must be large enough
  3276. // result will be NULL-terminated if NULL encountered in pszUTF8 before nUTF8Count
  3277. // and the number of UTF-8 bytes converted is returned
  3278. //
  3279. const char* pszPosUTF8 = pszUTF8;
  3280. const char* pszUTF8End = pszUTF8 + nUTF8Count;
  3281. int nUChar, nUTF8Len = 0, nUTF16Len = 0;
  3282. while ( pszPosUTF8 != pszUTF8End )
  3283. {
  3284. nUChar = DecodeCharUTF8( pszPosUTF8, pszUTF8End );
  3285. if ( ! nUChar )
  3286. {
  3287. if ( pwszUTF16 )
  3288. pwszUTF16[nUTF16Len] = 0;
  3289. break;
  3290. }
  3291. else if ( nUChar == -1 )
  3292. nUChar = '?';
  3293. // Encode UTF-16
  3294. EncodeCharUTF16( nUChar, pwszUTF16, nUTF16Len );
  3295. }
  3296. nUTF8Len = (int)(pszPosUTF8 - pszUTF8);
  3297. if ( ! pwszUTF16 )
  3298. return nUTF16Len;
  3299. return nUTF8Len;
  3300. }
  3301. #if ! defined(MARKUP_WCHAR) // not WCHAR
  3302. MCD_STR CMarkup::UTF8ToA( MCD_CSTR pszUTF8, int* pnFailed/*=NULL*/ )
  3303. {
  3304. // Converts from UTF-8 to locale ANSI charset
  3305. MCD_STR strANSI;
  3306. int nMBLen = (int)MCD_PSZLEN( pszUTF8 );
  3307. if ( pnFailed )
  3308. *pnFailed = 0;
  3309. if ( nMBLen )
  3310. {
  3311. TextEncoding textencoding( MCD_T("UTF-8"), (const void*)pszUTF8, nMBLen );
  3312. textencoding.m_nToCount = nMBLen;
  3313. MCD_CHAR* pANSIBuffer = MCD_GETBUFFER(strANSI,textencoding.m_nToCount);
  3314. nMBLen = textencoding.PerformConversion( (void*)pANSIBuffer );
  3315. MCD_RELEASEBUFFER(strANSI,pANSIBuffer,nMBLen);
  3316. if ( pnFailed )
  3317. *pnFailed = textencoding.m_nFailedChars;
  3318. }
  3319. return strANSI;
  3320. }
  3321. MCD_STR CMarkup::AToUTF8( MCD_CSTR pszANSI )
  3322. {
  3323. // Converts locale ANSI charset to UTF-8
  3324. MCD_STR strUTF8;
  3325. int nMBLen = (int)MCD_PSZLEN( pszANSI );
  3326. if ( nMBLen )
  3327. {
  3328. TextEncoding textencoding( MCD_T(""), (const void*)pszANSI, nMBLen );
  3329. textencoding.m_nToCount = nMBLen * 4;
  3330. MCD_CHAR* pUTF8Buffer = MCD_GETBUFFER(strUTF8,textencoding.m_nToCount);
  3331. nMBLen = textencoding.PerformConversion( (void*)pUTF8Buffer, MCD_T("UTF-8") );
  3332. MCD_RELEASEBUFFER(strUTF8,pUTF8Buffer,nMBLen);
  3333. }
  3334. return strUTF8;
  3335. }
  3336. #endif // not WCHAR
  3337. MCD_STR CMarkup::GetDeclaredEncoding( MCD_CSTR szDoc )
  3338. {
  3339. // Extract encoding attribute from XML Declaration, or HTML meta charset
  3340. MCD_STR strEncoding;
  3341. TokenPos token( szDoc, MDF_IGNORECASE );
  3342. NodePos node;
  3343. bool bHtml = false;
  3344. int nTypeFound = 0;
  3345. while ( nTypeFound >= 0 )
  3346. {
  3347. nTypeFound = token.ParseNode( node );
  3348. int nNext = token.m_nNext;
  3349. if ( nTypeFound == MNT_PROCESSING_INSTRUCTION && node.nStart == 0 )
  3350. {
  3351. token.m_nNext = node.nStart + 2; // after <?
  3352. if ( token.FindName() && token.Match(MCD_T("xml")) )
  3353. {
  3354. // e.g. <?xml version="1.0" encoding="UTF-8"?>
  3355. if ( token.FindAttrib(MCD_T("encoding")) )
  3356. strEncoding = token.GetTokenText();
  3357. break;
  3358. }
  3359. }
  3360. else if ( nTypeFound == 0 ) // end tag
  3361. {
  3362. // Check for end of HTML head
  3363. token.m_nNext = node.nStart + 2; // after </
  3364. if ( token.FindName() && token.Match(MCD_T("head")) )
  3365. break;
  3366. }
  3367. else if ( nTypeFound == MNT_ELEMENT )
  3368. {
  3369. token.m_nNext = node.nStart + 1; // after <
  3370. token.FindName();
  3371. if ( ! bHtml )
  3372. {
  3373. if ( ! token.Match(MCD_T("html")) )
  3374. break;
  3375. bHtml = true;
  3376. }
  3377. else if ( token.Match(MCD_T("meta")) )
  3378. {
  3379. // e.g. <META http-equiv=Content-Type content="text/html; charset=UTF-8">
  3380. int nAttribOffset = node.nStart + 1;
  3381. token.m_nNext = nAttribOffset;
  3382. if ( token.FindAttrib(MCD_T("http-equiv")) && token.Match(MCD_T("Content-Type")) )
  3383. {
  3384. token.m_nNext = nAttribOffset;
  3385. if ( token.FindAttrib(MCD_T("content")) )
  3386. {
  3387. int nContentEndOffset = token.m_nNext;
  3388. token.m_nNext = token.m_nL;
  3389. while ( token.m_nNext < nContentEndOffset && token.FindName() )
  3390. {
  3391. if ( token.Match(MCD_T("charset")) && token.FindName() && token.Match(MCD_T("=")) )
  3392. {
  3393. token.FindName();
  3394. strEncoding = token.GetTokenText();
  3395. break;
  3396. }
  3397. }
  3398. }
  3399. break;
  3400. }
  3401. }
  3402. }
  3403. token.m_nNext = nNext;
  3404. }
  3405. return strEncoding;
  3406. }
  3407. int CMarkup::GetEncodingCodePage( MCD_CSTR pszEncoding )
  3408. {
  3409. return x_GetEncodingCodePage( pszEncoding );
  3410. }
  3411. int CMarkup::FindNode( int nType )
  3412. {
  3413. // Change current node position only if a node is found
  3414. // If nType is 0 find any node, otherwise find node of type nType
  3415. // Return type of node or 0 if not found
  3416. // Determine where in document to start scanning for node
  3417. int nNodeOffset = m_nNodeOffset;
  3418. if ( m_nNodeType > MNT_ELEMENT )
  3419. {
  3420. // By-pass current node
  3421. nNodeOffset += m_nNodeLength;
  3422. }
  3423. else // element or no current main position
  3424. {
  3425. // Set position to begin looking for node
  3426. if ( m_iPos )
  3427. {
  3428. // After element
  3429. nNodeOffset = ELEM(m_iPos).StartAfter();
  3430. }
  3431. else if ( m_iPosParent )
  3432. {
  3433. // Immediately after start tag of parent
  3434. if ( ELEM(m_iPosParent).IsEmptyElement() )
  3435. return 0;
  3436. else
  3437. nNodeOffset = ELEM(m_iPosParent).StartContent();
  3438. }
  3439. }
  3440. // Get nodes until we find what we're looking for
  3441. int nTypeFound = 0;
  3442. int iPosNew = m_iPos;
  3443. TokenPos token( m_strDoc, m_nDocFlags );
  3444. NodePos node;
  3445. token.m_nNext = nNodeOffset;
  3446. do
  3447. {
  3448. nNodeOffset = token.m_nNext;
  3449. nTypeFound = token.ParseNode( node );
  3450. if ( nTypeFound == 0 )
  3451. {
  3452. // Check if we have reached the end of the parent element
  3453. if ( m_iPosParent && nNodeOffset == ELEM(m_iPosParent).StartContent()
  3454. + ELEM(m_iPosParent).ContentLen() )
  3455. return 0;
  3456. nTypeFound = MNT_LONE_END_TAG; // otherwise it is a lone end tag
  3457. }
  3458. else if ( nTypeFound < 0 )
  3459. {
  3460. if ( nTypeFound == -2 ) // end of document
  3461. return 0;
  3462. // -1 is node error
  3463. nTypeFound = MNT_NODE_ERROR;
  3464. }
  3465. else if ( nTypeFound == MNT_ELEMENT )
  3466. {
  3467. if ( iPosNew )
  3468. iPosNew = ELEM(iPosNew).iElemNext;
  3469. else
  3470. iPosNew = ELEM(m_iPosParent).iElemChild;
  3471. if ( ! iPosNew )
  3472. return 0;
  3473. if ( ! nType || (nType & nTypeFound) )
  3474. {
  3475. // Found element node, move position to this element
  3476. x_SetPos( m_iPosParent, iPosNew, 0 );
  3477. return m_nNodeType;
  3478. }
  3479. token.m_nNext = ELEM(iPosNew).StartAfter();
  3480. }
  3481. }
  3482. while ( nType && ! (nType & nTypeFound) );
  3483. m_iPos = iPosNew;
  3484. m_iPosChild = 0;
  3485. m_nNodeOffset = node.nStart;
  3486. m_nNodeLength = node.nLength;
  3487. m_nNodeType = nTypeFound;
  3488. MARKUP_SETDEBUGSTATE;
  3489. return m_nNodeType;
  3490. }
  3491. bool CMarkup::RemoveNode()
  3492. {
  3493. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3494. return false;
  3495. if ( m_iPos || m_nNodeLength )
  3496. {
  3497. x_RemoveNode( m_iPosParent, m_iPos, m_nNodeType, m_nNodeOffset, m_nNodeLength );
  3498. m_iPosChild = 0;
  3499. MARKUP_SETDEBUGSTATE;
  3500. return true;
  3501. }
  3502. return false;
  3503. }
  3504. MCD_STR CMarkup::GetTagName() const
  3505. {
  3506. // Return the tag name at the current main position
  3507. MCD_STR strTagName;
  3508. // This method is primarily for elements, however
  3509. // it does return something for certain other nodes
  3510. if ( m_nNodeLength )
  3511. {
  3512. switch ( m_nNodeType )
  3513. {
  3514. case MNT_PROCESSING_INSTRUCTION:
  3515. case MNT_LONE_END_TAG:
  3516. {
  3517. // <?target or </tagname
  3518. TokenPos token( m_strDoc, m_nDocFlags );
  3519. token.m_nNext = m_nNodeOffset + 2;
  3520. if ( token.FindName() )
  3521. strTagName = token.GetTokenText();
  3522. }
  3523. break;
  3524. case MNT_COMMENT:
  3525. strTagName = MCD_T("#comment");
  3526. break;
  3527. case MNT_CDATA_SECTION:
  3528. strTagName = MCD_T("#cdata-section");
  3529. break;
  3530. case MNT_DOCUMENT_TYPE:
  3531. {
  3532. // <!DOCTYPE name
  3533. TokenPos token( m_strDoc, m_nDocFlags );
  3534. token.m_nNext = m_nNodeOffset + 2;
  3535. if ( token.FindName() && token.FindName() )
  3536. strTagName = token.GetTokenText();
  3537. }
  3538. break;
  3539. case MNT_TEXT:
  3540. case MNT_WHITESPACE:
  3541. strTagName = MCD_T("#text");
  3542. break;
  3543. }
  3544. return strTagName;
  3545. }
  3546. if ( m_iPos )
  3547. strTagName = x_GetTagName( m_iPos );
  3548. return strTagName;
  3549. }
  3550. bool CMarkup::IntoElem()
  3551. {
  3552. // Make current element the parent
  3553. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3554. {
  3555. x_SetPos( m_iPos, m_iPosChild, 0 );
  3556. return true;
  3557. }
  3558. return false;
  3559. }
  3560. bool CMarkup::OutOfElem()
  3561. {
  3562. // Go to parent element
  3563. if ( m_iPosParent )
  3564. {
  3565. x_SetPos( ELEM(m_iPosParent).iElemParent, m_iPosParent, m_iPos );
  3566. return true;
  3567. }
  3568. return false;
  3569. }
  3570. bool CMarkup::GetNthAttrib( int n, MCD_STR& strAttrib, MCD_STR& strValue ) const
  3571. {
  3572. // Return nth attribute name and value from main position
  3573. TokenPos token( m_strDoc, m_nDocFlags );
  3574. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3575. token.m_nNext = ELEM(m_iPos).nStart + 1;
  3576. else if ( m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  3577. token.m_nNext = m_nNodeOffset + 2;
  3578. else
  3579. return false;
  3580. if ( token.FindAttrib(NULL,n,&strAttrib) )
  3581. {
  3582. strValue = UnescapeText( token.GetTokenPtr(), token.Length(), m_nDocFlags );
  3583. return true;
  3584. }
  3585. return false;
  3586. }
  3587. MCD_STR CMarkup::GetAttribName( int n ) const
  3588. {
  3589. // Return nth attribute name of main position
  3590. TokenPos token( m_strDoc, m_nDocFlags );
  3591. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3592. token.m_nNext = ELEM(m_iPos).nStart + 1;
  3593. else if ( m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  3594. token.m_nNext = m_nNodeOffset + 2;
  3595. else
  3596. return MCD_T("");
  3597. if ( token.FindAttrib(NULL,n) )
  3598. return token.GetTokenText();
  3599. return MCD_T("");
  3600. }
  3601. bool CMarkup::SavePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ )
  3602. {
  3603. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3604. return false;
  3605. // Save current element position in saved position map
  3606. if ( szPosName )
  3607. {
  3608. SavedPosMap* pMap;
  3609. m_pSavedPosMaps->GetMap( pMap, nMap );
  3610. SavedPos savedpos;
  3611. if ( szPosName )
  3612. savedpos.strName = szPosName;
  3613. if ( m_iPosChild )
  3614. {
  3615. savedpos.iPos = m_iPosChild;
  3616. savedpos.nSavedPosFlags |= SavedPos::SPM_CHILD;
  3617. }
  3618. else if ( m_iPos )
  3619. {
  3620. savedpos.iPos = m_iPos;
  3621. savedpos.nSavedPosFlags |= SavedPos::SPM_MAIN;
  3622. }
  3623. else
  3624. {
  3625. savedpos.iPos = m_iPosParent;
  3626. }
  3627. savedpos.nSavedPosFlags |= SavedPos::SPM_USED;
  3628. int nSlot = x_Hash( szPosName, pMap->nMapSize);
  3629. SavedPos* pSavedPos = pMap->pTable[nSlot];
  3630. int nOffset = 0;
  3631. if ( ! pSavedPos )
  3632. {
  3633. pSavedPos = new SavedPos[2];
  3634. pSavedPos[1].nSavedPosFlags = SavedPos::SPM_LAST;
  3635. pMap->pTable[nSlot] = pSavedPos;
  3636. }
  3637. else
  3638. {
  3639. while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  3640. {
  3641. if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName )
  3642. break;
  3643. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3644. {
  3645. int nNewSize = (nOffset + 6) * 2;
  3646. SavedPos* pNewSavedPos = new SavedPos[nNewSize];
  3647. for ( int nCopy=0; nCopy<=nOffset; ++nCopy )
  3648. pNewSavedPos[nCopy] = pSavedPos[nCopy];
  3649. pNewSavedPos[nOffset].nSavedPosFlags ^= SavedPos::SPM_LAST;
  3650. pNewSavedPos[nNewSize-1].nSavedPosFlags = SavedPos::SPM_LAST;
  3651. delete [] pSavedPos;
  3652. pSavedPos = pNewSavedPos;
  3653. pMap->pTable[nSlot] = pSavedPos;
  3654. ++nOffset;
  3655. break;
  3656. }
  3657. ++nOffset;
  3658. }
  3659. }
  3660. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3661. savedpos.nSavedPosFlags |= SavedPos::SPM_LAST;
  3662. pSavedPos[nOffset] = savedpos;
  3663. /*
  3664. // To review hash table balance, uncomment and watch strBalance
  3665. MCD_STR strBalance, strSlot;
  3666. for ( nSlot=0; nSlot < pMap->nMapSize; ++nSlot )
  3667. {
  3668. pSavedPos = pMap->pTable[nSlot];
  3669. int nCount = 0;
  3670. while ( pSavedPos && pSavedPos->nSavedPosFlags & SavedPos::SPM_USED )
  3671. {
  3672. ++nCount;
  3673. if ( pSavedPos->nSavedPosFlags & SavedPos::SPM_LAST )
  3674. break;
  3675. ++pSavedPos;
  3676. }
  3677. strSlot.Format( MCD_T("%d "), nCount );
  3678. strBalance += strSlot;
  3679. }
  3680. */
  3681. return true;
  3682. }
  3683. return false;
  3684. }
  3685. bool CMarkup::RestorePos( MCD_CSTR szPosName /*=""*/, int nMap /*=0*/ )
  3686. {
  3687. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3688. return false;
  3689. // Restore element position if found in saved position map
  3690. if ( szPosName )
  3691. {
  3692. SavedPosMap* pMap;
  3693. m_pSavedPosMaps->GetMap( pMap, nMap );
  3694. int nSlot = x_Hash( szPosName, pMap->nMapSize );
  3695. SavedPos* pSavedPos = pMap->pTable[nSlot];
  3696. if ( pSavedPos )
  3697. {
  3698. int nOffset = 0;
  3699. while ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  3700. {
  3701. if ( pSavedPos[nOffset].strName == (MCD_PCSZ)szPosName )
  3702. {
  3703. int i = pSavedPos[nOffset].iPos;
  3704. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_CHILD )
  3705. x_SetPos( ELEM(ELEM(i).iElemParent).iElemParent, ELEM(i).iElemParent, i );
  3706. else if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_MAIN )
  3707. x_SetPos( ELEM(i).iElemParent, i, 0 );
  3708. else
  3709. x_SetPos( i, 0, 0 );
  3710. return true;
  3711. }
  3712. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  3713. break;
  3714. ++nOffset;
  3715. }
  3716. }
  3717. }
  3718. return false;
  3719. }
  3720. bool CMarkup::SetMapSize( int nSize, int nMap /*=0*/ )
  3721. {
  3722. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3723. return false;
  3724. // Set saved position map hash table size before using it
  3725. // Returns false if map already exists
  3726. // Some prime numbers: 53, 101, 211, 503, 1009, 2003, 10007, 20011, 50021, 100003, 200003, 500009
  3727. SavedPosMap* pNewMap;
  3728. return m_pSavedPosMaps->GetMap( pNewMap, nMap, nSize );
  3729. }
  3730. bool CMarkup::RemoveElem()
  3731. {
  3732. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3733. return false;
  3734. // Remove current main position element
  3735. if ( m_iPos && m_nNodeType == MNT_ELEMENT )
  3736. {
  3737. int iPos = x_RemoveElem( m_iPos );
  3738. x_SetPos( m_iPosParent, iPos, 0 );
  3739. return true;
  3740. }
  3741. return false;
  3742. }
  3743. bool CMarkup::RemoveChildElem()
  3744. {
  3745. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  3746. return false;
  3747. // Remove current child position element
  3748. if ( m_iPosChild )
  3749. {
  3750. int iPosChild = x_RemoveElem( m_iPosChild );
  3751. x_SetPos( m_iPosParent, m_iPos, iPosChild );
  3752. return true;
  3753. }
  3754. return false;
  3755. }
  3756. //////////////////////////////////////////////////////////////////////
  3757. // CMarkup private methods
  3758. //
  3759. void CMarkup::x_InitMarkup()
  3760. {
  3761. // Only called from CMarkup constructors
  3762. m_pFilePos = NULL;
  3763. m_pSavedPosMaps = new SavedPosMapArray;
  3764. m_pElemPosTree = new ElemPosTree;
  3765. // To always ignore case, define MARKUP_IGNORECASE
  3766. #if defined(MARKUP_IGNORECASE) // ignore case
  3767. m_nDocFlags = MDF_IGNORECASE;
  3768. #else // not ignore case
  3769. m_nDocFlags = 0;
  3770. #endif // not ignore case
  3771. }
  3772. int CMarkup::x_GetParent( int i )
  3773. {
  3774. return ELEM(i).iElemParent;
  3775. }
  3776. void CMarkup::x_SetPos( int iPosParent, int iPos, int iPosChild )
  3777. {
  3778. m_iPosParent = iPosParent;
  3779. m_iPos = iPos;
  3780. m_iPosChild = iPosChild;
  3781. m_nNodeOffset = 0;
  3782. m_nNodeLength = 0;
  3783. m_nNodeType = iPos?MNT_ELEMENT:0;
  3784. MARKUP_SETDEBUGSTATE;
  3785. }
  #if defined(_DEBUG) // DEBUG
  // Debug builds only: point m_pDebugPos into the document text at the
  // current position and set m_pDebugCur to a label naming which kind of
  // position that is (node, child, main, parent or top of document).
  void CMarkup::x_SetDebugState()
  {
      MCD_PCSZ pDoc = MCD_2PCSZ(m_strDoc);

      // Node (non-element) position is determined differently in file mode
      const bool bNodePosition = m_nNodeLength
          || (m_nNodeOffset && !m_pFilePos)
          || (m_pFilePos && (!m_iPos) && (!m_iPosParent) && ! m_pFilePos->FileAtTop());
      if ( bNodePosition )
      {
          if ( m_nNodeLength )
              m_pDebugCur = MCD_T("main position node");
          else
              m_pDebugCur = MCD_T("main position offset"); // file mode only
          m_pDebugPos = &pDoc[m_nNodeOffset];
          return;
      }

      // Element positions: report the innermost one that is set
      if ( m_iPosChild )
      {
          m_pDebugCur = MCD_T("child position element");
          m_pDebugPos = &pDoc[ELEM(m_iPosChild).nStart];
      }
      else if ( m_iPos )
      {
          m_pDebugCur = MCD_T("main position element");
          m_pDebugPos = &pDoc[ELEM(m_iPos).nStart];
      }
      else if ( m_iPosParent )
      {
          m_pDebugCur = MCD_T("parent position element");
          m_pDebugPos = &pDoc[ELEM(m_iPosParent).nStart];
      }
      else
      {
          m_pDebugCur = MCD_T("top of document");
          m_pDebugPos = pDoc;
      }
  }
  #endif // DEBUG
  3826. int CMarkup::x_GetFreePos()
  3827. {
  3828. if ( m_iPosFree == m_pElemPosTree->GetSize() )
  3829. x_AllocElemPos();
  3830. return m_iPosFree++;
  3831. }
  3832. bool CMarkup::x_AllocElemPos( int nNewSize /*=0*/ )
  3833. {
  3834. // Resize m_aPos when the document is created or the array is filled
  3835. if ( ! nNewSize )
  3836. nNewSize = m_iPosFree + (m_iPosFree>>1); // Grow By: multiply size by 1.5
  3837. if ( m_pElemPosTree->GetSize() < nNewSize )
  3838. m_pElemPosTree->GrowElemPosTree( nNewSize );
  3839. return true;
  3840. }
  3841. bool CMarkup::x_ParseDoc()
  3842. {
  3843. // Reset indexes
  3844. ResetPos();
  3845. m_pSavedPosMaps->ReleaseMaps();
  3846. // Starting size of position array: 1 element per 64 bytes of document
  3847. // Tight fit when parsing small doc, only 0 to 2 reallocs when parsing large doc
  3848. // Start at 8 when creating new document
  3849. int nDocLen = MCD_STRLENGTH(m_strDoc);
  3850. m_iPosFree = 1;
  3851. x_AllocElemPos( nDocLen / 64 + 8 );
  3852. m_iPosDeleted = 0;
  3853. // Parse document
  3854. ELEM(0).ClearVirtualParent();
  3855. if ( nDocLen )
  3856. {
  3857. TokenPos token( m_strDoc, m_nDocFlags );
  3858. int iPos = x_ParseElem( 0, token );
  3859. ELEM(0).nLength = nDocLen;
  3860. if ( iPos > 0 )
  3861. {
  3862. ELEM(0).iElemChild = iPos;
  3863. if ( ELEM(iPos).iElemNext )
  3864. x_AddResult( m_strResult, MCD_T("root_has_sibling") );
  3865. }
  3866. else
  3867. x_AddResult( m_strResult, MCD_T("no_root_element") );
  3868. }
  3869. ResetPos();
  3870. return IsWellFormed();
  3871. }
  3872. int CMarkup::x_ParseElem( int iPosParent, TokenPos& token )
  3873. {
  3874. // This is either called by x_ParseDoc or x_AddSubDoc or x_SetElemContent
  3875. // Returns index of the first element encountered or zero if no elements
  3876. //
  3877. int iPosRoot = 0;
  3878. int iPos = iPosParent;
  3879. int iVirtualParent = iPosParent;
  3880. int nRootDepth = ELEM(iPos).Level();
  3881. int nMatchLevel;
  3882. int iPosMatch;
  3883. int iTag;
  3884. int nTypeFound;
  3885. int iPosFirst;
  3886. int iPosLast;
  3887. ElemPos* pElem;
  3888. ElemPos* pElemParent;
  3889. ElemPos* pElemChild;
  3890. // Loop through the nodes of the document
  3891. ElemStack elemstack;
  3892. NodePos node;
  3893. token.m_nNext = 0;
  3894. while ( 1 )
  3895. {
  3896. nTypeFound = token.ParseNode( node );
  3897. nMatchLevel = 0;
  3898. if ( nTypeFound == MNT_ELEMENT ) // start tag
  3899. {
  3900. iPos = x_GetFreePos();
  3901. if ( ! iPosRoot )
  3902. iPosRoot = iPos;
  3903. pElem = &ELEM(iPos);
  3904. pElem->iElemParent = iPosParent;
  3905. pElem->iElemNext = 0;
  3906. pElemParent = &ELEM(iPosParent);
  3907. if ( pElemParent->iElemChild )
  3908. {
  3909. iPosFirst = pElemParent->iElemChild;
  3910. pElemChild = &ELEM(iPosFirst);
  3911. iPosLast = pElemChild->iElemPrev;
  3912. ELEM(iPosLast).iElemNext = iPos;
  3913. pElem->iElemPrev = iPosLast;
  3914. pElemChild->iElemPrev = iPos;
  3915. pElem->nFlags = 0;
  3916. }
  3917. else
  3918. {
  3919. pElemParent->iElemChild = iPos;
  3920. pElem->iElemPrev = iPos;
  3921. pElem->nFlags = MNF_FIRST;
  3922. }
  3923. pElem->SetLevel( nRootDepth + elemstack.iTop );
  3924. pElem->iElemChild = 0;
  3925. pElem->nStart = node.nStart;
  3926. pElem->SetStartTagLen( node.nLength );
  3927. if ( node.nNodeFlags & MNF_EMPTY )
  3928. {
  3929. iPos = iPosParent;
  3930. pElem->SetEndTagLen( 0 );
  3931. pElem->nLength = node.nLength;
  3932. }
  3933. else
  3934. {
  3935. iPosParent = iPos;
  3936. elemstack.PushIntoLevel( token.GetTokenPtr(), token.Length() );
  3937. }
  3938. }
  3939. else if ( nTypeFound == 0 ) // end tag
  3940. {
  3941. iPosMatch = iPos;
  3942. iTag = elemstack.iTop;
  3943. nMatchLevel = iTag;
  3944. while ( nMatchLevel && ! token.Match(elemstack.GetRefTagPosAt(iTag--).strTagName) )
  3945. {
  3946. --nMatchLevel;
  3947. iPosMatch = ELEM(iPosMatch).iElemParent;
  3948. }
  3949. if ( nMatchLevel == 0 )
  3950. {
  3951. // Not matched at all, it is a lone end tag, a non-element node
  3952. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  3953. ELEM(iPos).nFlags |= MNF_ILLDATA;
  3954. x_AddResult( m_strResult, MCD_T("lone_end_tag"), token.GetTokenText(), 0, node.nStart );
  3955. }
  3956. else
  3957. {
  3958. pElem = &ELEM(iPosMatch);
  3959. pElem->nLength = node.nStart - pElem->nStart + node.nLength;
  3960. pElem->SetEndTagLen( node.nLength );
  3961. }
  3962. }
  3963. else if ( nTypeFound == -1 )
  3964. {
  3965. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  3966. ELEM(iPos).nFlags |= MNF_ILLDATA;
  3967. m_strResult += node.strMeta;
  3968. }
  3969. // Matched end tag, or end of document
  3970. if ( nMatchLevel || nTypeFound == -2 )
  3971. {
  3972. if ( elemstack.iTop > nMatchLevel )
  3973. ELEM(iVirtualParent).nFlags |= MNF_ILLFORMED;
  3974. // Process any non-ended elements
  3975. while ( elemstack.iTop > nMatchLevel )
  3976. {
  3977. // Element with no end tag
  3978. pElem = &ELEM(iPos);
  3979. int iPosChild = pElem->iElemChild;
  3980. iPosParent = pElem->iElemParent;
  3981. pElem->SetEndTagLen( 0 );
  3982. pElem->nFlags |= MNF_NONENDED;
  3983. pElem->iElemChild = 0;
  3984. pElem->nLength = pElem->StartTagLen();
  3985. if ( pElem->nFlags & MNF_ILLDATA )
  3986. {
  3987. pElem->nFlags ^= MNF_ILLDATA;
  3988. ELEM(iPosParent).nFlags |= MNF_ILLDATA;
  3989. }
  3990. while ( iPosChild )
  3991. {
  3992. ELEM(iPosChild).iElemParent = iPosParent;
  3993. ELEM(iPosChild).iElemPrev = iPos;
  3994. ELEM(iPos).iElemNext = iPosChild;
  3995. iPos = iPosChild;
  3996. iPosChild = ELEM(iPosChild).iElemNext;
  3997. }
  3998. // If end tag did not match, top node is end tag that did not match pElem
  3999. // if end of document, any nodes below top have no end tag
  4000. // second offset represents location where end tag was expected but end of document or other end tag was found
  4001. // end tag that was found is token.GetTokenText() but not reported in error
  4002. int nOffset2 = (nTypeFound==0)? token.m_nL-1: MCD_STRLENGTH(m_strDoc);
  4003. x_AddResult( m_strResult, MCD_T("unended_start_tag"), elemstack.Current().strTagName, 0, pElem->nStart, nOffset2 );
  4004. iPos = iPosParent;
  4005. elemstack.PopOutOfLevel();
  4006. }
  4007. if ( nTypeFound == -2 )
  4008. break;
  4009. iPosParent = ELEM(iPos).iElemParent;
  4010. iPos = iPosParent;
  4011. elemstack.PopOutOfLevel();
  4012. }
  4013. }
  4014. return iPosRoot;
  4015. }
  4016. int CMarkup::x_FindElem( int iPosParent, int iPos, PathPos& path ) const
  4017. {
  4018. // If pPath is NULL or empty, go to next sibling element
  4019. // Otherwise go to next sibling element with matching path
  4020. //
  4021. if ( ! path.ValidPath() )
  4022. return 0;
  4023. // Paths other than simple tag name are only supported in the developer version
  4024. if ( path.IsAnywherePath() || path.IsAbsolutePath() )
  4025. return 0;
  4026. if ( iPos )
  4027. iPos = ELEM(iPos).iElemNext;
  4028. else
  4029. iPos = ELEM(iPosParent).iElemChild;
  4030. // Finished here if pPath not specified
  4031. if ( ! path.IsPath() )
  4032. return iPos;
  4033. // Search
  4034. TokenPos token( m_strDoc, m_nDocFlags );
  4035. while ( iPos )
  4036. {
  4037. // Compare tag name
  4038. token.m_nNext = ELEM(iPos).nStart + 1;
  4039. token.FindName(); // Locate tag name
  4040. if ( token.Match(path.GetPtr()) )
  4041. return iPos;
  4042. iPos = ELEM(iPos).iElemNext;
  4043. }
  4044. return 0;
  4045. }
  4046. MCD_STR CMarkup::x_GetPath( int iPos ) const
  4047. {
  4048. // In file mode, iPos is an index into m_pFilePos->m_elemstack or zero
  4049. MCD_STR strPath;
  4050. while ( iPos )
  4051. {
  4052. MCD_STR strTagName;
  4053. int iPosParent;
  4054. int nCount = 0;
  4055. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  4056. {
  4057. TagPos& tag = m_pFilePos->m_elemstack.GetRefTagPosAt(iPos);
  4058. strTagName = tag.strTagName;
  4059. nCount = tag.nCount;
  4060. iPosParent = tag.iParent;
  4061. }
  4062. else
  4063. {
  4064. strTagName = x_GetTagName( iPos );
  4065. PathPos path( MCD_2PCSZ(strTagName), false );
  4066. iPosParent = ELEM(iPos).iElemParent;
  4067. int iPosSib = 0;
  4068. while ( iPosSib != iPos )
  4069. {
  4070. path.RevertOffset();
  4071. iPosSib = x_FindElem( iPosParent, iPosSib, path );
  4072. ++nCount;
  4073. }
  4074. }
  4075. if ( nCount == 1 )
  4076. strPath = MCD_T("/") + strTagName + strPath;
  4077. else
  4078. {
  4079. MCD_CHAR szPred[25];
  4080. MCD_SPRINTF( MCD_SSZ(szPred), MCD_T("[%d]"), nCount );
  4081. strPath = MCD_T("/") + strTagName + szPred + strPath;
  4082. }
  4083. iPos = iPosParent;
  4084. }
  4085. return strPath;
  4086. }
  4087. MCD_STR CMarkup::x_GetTagName( int iPos ) const
  4088. {
  4089. // Return the tag name at specified element
  4090. TokenPos token( m_strDoc, m_nDocFlags );
  4091. token.m_nNext = ELEM(iPos).nStart + 1;
  4092. if ( ! iPos || ! token.FindName() )
  4093. return MCD_T("");
  4094. // Return substring of document
  4095. return token.GetTokenText();
  4096. }
  4097. MCD_STR CMarkup::x_GetAttrib( int iPos, MCD_PCSZ pAttrib ) const
  4098. {
  4099. // Return the value of the attrib
  4100. TokenPos token( m_strDoc, m_nDocFlags );
  4101. if ( iPos && m_nNodeType == MNT_ELEMENT )
  4102. token.m_nNext = ELEM(iPos).nStart + 1;
  4103. else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4104. token.m_nNext = m_nNodeOffset + 2;
  4105. else
  4106. return MCD_T("");
  4107. if ( pAttrib && token.FindAttrib(pAttrib) )
  4108. return UnescapeText( token.GetTokenPtr(), token.Length(), m_nDocFlags );
  4109. return MCD_T("");
  4110. }
  4111. bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, int nValue, int nFlags /*=0*/ )
  4112. {
  4113. // Convert integer to string
  4114. MCD_CHAR szVal[25];
  4115. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4116. return x_SetAttrib( iPos, pAttrib, szVal, nFlags );
  4117. }
  4118. bool CMarkup::x_SetAttrib( int iPos, MCD_PCSZ pAttrib, MCD_PCSZ pValue, int nFlags /*=0*/ )
  4119. {
  4120. if ( m_nDocFlags & MDF_READFILE )
  4121. return false;
  4122. int nNodeStart = 0;
  4123. if ( iPos && m_nNodeType == MNT_ELEMENT )
  4124. nNodeStart = ELEM(iPos).nStart;
  4125. else if ( iPos == m_iPos && m_nNodeLength && m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4126. nNodeStart = m_nNodeOffset;
  4127. else
  4128. return false;
  4129. // Create insertion text depending on whether attribute already exists
  4130. // Decision: for empty value leaving attrib="" instead of removing attrib
  4131. TokenPos token( m_strDoc, m_nDocFlags );
  4132. token.m_nNext = nNodeStart + ((m_nNodeType == MNT_ELEMENT)?1:2);
  4133. int nReplace = 0;
  4134. int nInsertAt;
  4135. MCD_STR strEscapedValue = EscapeText( pValue, MNF_ESCAPEQUOTES|nFlags );
  4136. int nEscapedValueLen = MCD_STRLENGTH( strEscapedValue );
  4137. MCD_STR strInsert;
  4138. if ( token.FindAttrib(pAttrib) )
  4139. {
  4140. // Replace value
  4141. MCD_BLDRESERVE( strInsert, nEscapedValueLen + 2 );
  4142. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4143. MCD_BLDAPPENDN( strInsert, MCD_2PCSZ(strEscapedValue), nEscapedValueLen );
  4144. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4145. MCD_BLDRELEASE( strInsert );
  4146. nInsertAt = token.m_nL - ((token.m_nTokenFlags&MNF_QUOTED)?1:0);
  4147. nReplace = token.Length() + ((token.m_nTokenFlags&MNF_QUOTED)?2:0);
  4148. }
  4149. else
  4150. {
  4151. // Insert string name value pair
  4152. int nAttribNameLen = MCD_PSZLEN( pAttrib );
  4153. MCD_BLDRESERVE( strInsert, nAttribNameLen + nEscapedValueLen + 4 );
  4154. MCD_BLDAPPEND1( strInsert, ' ' );
  4155. MCD_BLDAPPENDN( strInsert, pAttrib, nAttribNameLen );
  4156. MCD_BLDAPPEND1( strInsert, '=' );
  4157. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4158. MCD_BLDAPPENDN( strInsert, MCD_2PCSZ(strEscapedValue), nEscapedValueLen );
  4159. MCD_BLDAPPEND1( strInsert, x_ATTRIBQUOTE );
  4160. MCD_BLDRELEASE( strInsert );
  4161. nInsertAt = token.m_nNext;
  4162. }
  4163. int nAdjust = MCD_STRLENGTH(strInsert) - nReplace;
  4164. if ( m_nDocFlags & MDF_WRITEFILE )
  4165. {
  4166. int nNewDocLength = MCD_STRLENGTH(m_strDoc) + nAdjust;
  4167. MCD_STRCLEAR( m_strResult );
  4168. if ( nNodeStart && nNewDocLength > m_pFilePos->m_nBlockSizeBasis )
  4169. {
  4170. int nDocCapacity = MCD_STRCAPACITY(m_strDoc);
  4171. if ( nNewDocLength > nDocCapacity )
  4172. {
  4173. m_pFilePos->FileFlush( *m_pFilePos->m_pstrBuffer, nNodeStart );
  4174. m_strResult = m_pFilePos->m_strIOResult;
  4175. nInsertAt -= nNodeStart;
  4176. m_nNodeOffset = 0;
  4177. if ( m_nNodeType == MNT_ELEMENT )
  4178. ELEM(iPos).nStart = 0;
  4179. }
  4180. }
  4181. }
  4182. x_DocChange( nInsertAt, nReplace, strInsert );
  4183. if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4184. {
  4185. x_AdjustForNode( m_iPosParent, m_iPos, nAdjust );
  4186. m_nNodeLength += nAdjust;
  4187. }
  4188. else
  4189. {
  4190. ELEM(iPos).AdjustStartTagLen( nAdjust );
  4191. ELEM(iPos).nLength += nAdjust;
  4192. x_Adjust( iPos, nAdjust );
  4193. }
  4194. MARKUP_SETDEBUGSTATE;
  4195. return true;
  4196. }
  4197. bool CMarkup::x_CreateNode( MCD_STR& strNode, int nNodeType, MCD_PCSZ pText )
  4198. {
  4199. // Set strNode based on nNodeType and szData
  4200. // Return false if szData would jeopardize well-formed document
  4201. //
  4202. switch ( nNodeType )
  4203. {
  4204. case MNT_PROCESSING_INSTRUCTION:
  4205. strNode = MCD_T("<?");
  4206. strNode += pText;
  4207. strNode += MCD_T("?>");
  4208. break;
  4209. case MNT_COMMENT:
  4210. strNode = MCD_T("<!--");
  4211. strNode += pText;
  4212. strNode += MCD_T("-->");
  4213. break;
  4214. case MNT_ELEMENT:
  4215. strNode = MCD_T("<");
  4216. strNode += pText;
  4217. strNode += MCD_T("/>");
  4218. break;
  4219. case MNT_TEXT:
  4220. case MNT_WHITESPACE:
  4221. strNode = EscapeText( pText );
  4222. break;
  4223. case MNT_DOCUMENT_TYPE:
  4224. strNode = pText;
  4225. break;
  4226. case MNT_LONE_END_TAG:
  4227. strNode = MCD_T("</");
  4228. strNode += pText;
  4229. strNode += MCD_T(">");
  4230. break;
  4231. case MNT_CDATA_SECTION:
  4232. if ( MCD_PSZSTR(pText,MCD_T("]]>")) != NULL )
  4233. return false;
  4234. strNode = MCD_T("<![CDATA[");
  4235. strNode += pText;
  4236. strNode += MCD_T("]]>");
  4237. break;
  4238. }
  4239. return true;
  4240. }
  4241. MCD_STR CMarkup::x_EncodeCDATASection( MCD_PCSZ szData )
  4242. {
  4243. // Split CDATA Sections if there are any end delimiters
  4244. MCD_STR strData = MCD_T("<![CDATA[");
  4245. MCD_PCSZ pszNextStart = szData;
  4246. MCD_PCSZ pszEnd = MCD_PSZSTR( szData, MCD_T("]]>") );
  4247. while ( pszEnd )
  4248. {
  4249. strData += MCD_STR( pszNextStart, (int)(pszEnd - pszNextStart) );
  4250. strData += MCD_T("]]]]><![CDATA[>");
  4251. pszNextStart = pszEnd + 3;
  4252. pszEnd = MCD_PSZSTR( pszNextStart, MCD_T("]]>") );
  4253. }
  4254. strData += pszNextStart;
  4255. strData += MCD_T("]]>");
  4256. return strData;
  4257. }
  4258. bool CMarkup::x_SetData( int iPos, int nValue )
  4259. {
  4260. // Convert integer to string
  4261. MCD_CHAR szVal[25];
  4262. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4263. return x_SetData( iPos, szVal, 0 );
  4264. }
  4265. bool CMarkup::x_SetData( int iPos, MCD_PCSZ szData, int nFlags )
  4266. {
  4267. if ( m_nDocFlags & MDF_READFILE )
  4268. return false;
  4269. MCD_STR strInsert;
  4270. if ( m_nDocFlags & MDF_WRITEFILE )
  4271. {
  4272. if ( ! iPos || m_nNodeType != 1 || ! ELEM(iPos).IsEmptyElement() )
  4273. return false; // only set data on current empty element (no other kinds of nodes)
  4274. }
  4275. if ( iPos == m_iPos && m_nNodeLength )
  4276. {
  4277. // Not an element
  4278. if ( ! x_CreateNode(strInsert, m_nNodeType, szData) )
  4279. return false;
  4280. x_DocChange( m_nNodeOffset, m_nNodeLength, strInsert );
  4281. x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(strInsert) - m_nNodeLength );
  4282. m_nNodeLength = MCD_STRLENGTH(strInsert);
  4283. MARKUP_SETDEBUGSTATE;
  4284. return true;
  4285. }
  4286. // Set data in iPos element
  4287. if ( ! iPos || ELEM(iPos).iElemChild )
  4288. return false;
  4289. // Build strInsert from szData based on nFlags
  4290. if ( nFlags & MNF_WITHCDATA )
  4291. strInsert = x_EncodeCDATASection( szData );
  4292. else
  4293. strInsert = EscapeText( szData, nFlags );
  4294. // Insert
  4295. NodePos node( MNF_WITHNOLINES|MNF_REPLACE );
  4296. node.strMeta = strInsert;
  4297. int iPosBefore = 0;
  4298. int nReplace = x_InsertNew( iPos, iPosBefore, node );
  4299. int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace;
  4300. x_Adjust( iPos, nAdjust );
  4301. ELEM(iPos).nLength += nAdjust;
  4302. if ( ELEM(iPos).nFlags & MNF_ILLDATA )
  4303. ELEM(iPos).nFlags &= ~MNF_ILLDATA;
  4304. MARKUP_SETDEBUGSTATE;
  4305. return true;
  4306. }
  4307. MCD_STR CMarkup::x_GetData( int iPos )
  4308. {
  4309. if ( iPos == m_iPos && m_nNodeLength )
  4310. {
  4311. if ( m_nNodeType == MNT_COMMENT )
  4312. return MCD_STRMID( m_strDoc, m_nNodeOffset+4, m_nNodeLength-7 );
  4313. else if ( m_nNodeType == MNT_PROCESSING_INSTRUCTION )
  4314. return MCD_STRMID( m_strDoc, m_nNodeOffset+2, m_nNodeLength-4 );
  4315. else if ( m_nNodeType == MNT_CDATA_SECTION )
  4316. return MCD_STRMID( m_strDoc, m_nNodeOffset+9, m_nNodeLength-12 );
  4317. else if ( m_nNodeType == MNT_TEXT )
  4318. return UnescapeText( &(MCD_2PCSZ(m_strDoc))[m_nNodeOffset], m_nNodeLength, m_nDocFlags );
  4319. else if ( m_nNodeType == MNT_LONE_END_TAG )
  4320. return MCD_STRMID( m_strDoc, m_nNodeOffset+2, m_nNodeLength-3 );
  4321. return MCD_STRMID( m_strDoc, m_nNodeOffset, m_nNodeLength );
  4322. }
  4323. // Return a string representing data between start and end tag
  4324. // Return empty string if there are any children elements
  4325. MCD_STR strData;
  4326. if ( iPos && ! ELEM(iPos).IsEmptyElement() )
  4327. {
  4328. ElemPos* pElem = &ELEM(iPos);
  4329. int nStartContent = pElem->StartContent();
  4330. if ( pElem->IsUnparsed() )
  4331. {
  4332. TokenPos token( m_strDoc, m_nDocFlags, m_pFilePos );
  4333. token.m_nNext = nStartContent;
  4334. NodePos node;
  4335. m_pFilePos->m_nReadBufferStart = pElem->nStart;
  4336. while ( 1 )
  4337. {
  4338. m_pFilePos->m_nReadBufferRemoved = 0; // will be non-zero after ParseNode if read buffer shifted
  4339. token.ParseNode( node );
  4340. if ( m_pFilePos->m_nReadBufferRemoved )
  4341. {
  4342. pElem->nStart = 0;
  4343. MARKUP_SETDEBUGSTATE;
  4344. }
  4345. if ( node.nNodeType == MNT_TEXT )
  4346. strData += UnescapeText( &token.m_pDocText[node.nStart], node.nLength, m_nDocFlags );
  4347. else if ( node.nNodeType == MNT_CDATA_SECTION )
  4348. strData += MCD_STRMID( m_strDoc, node.nStart+9, node.nLength-12 );
  4349. else if ( node.nNodeType == MNT_ELEMENT )
  4350. {
  4351. MCD_STRCLEAR(strData);
  4352. break;
  4353. }
  4354. else if ( node.nNodeType == 0 )
  4355. {
  4356. if ( token.Match(m_pFilePos->m_elemstack.Current().strTagName) )
  4357. {
  4358. pElem->SetEndTagLen( node.nLength );
  4359. pElem->nLength = node.nStart + node.nLength - pElem->nStart;
  4360. m_pFilePos->m_elemstack.OutOfLevel();
  4361. }
  4362. else
  4363. {
  4364. MCD_STRCLEAR(strData);
  4365. }
  4366. break;
  4367. }
  4368. }
  4369. }
  4370. else if ( ! pElem->iElemChild )
  4371. {
  4372. // Quick scan for any tags inside content
  4373. int nContentLen = pElem->ContentLen();
  4374. MCD_PCSZ pszContent = &(MCD_2PCSZ(m_strDoc))[nStartContent];
  4375. MCD_PCSZ pszTag = MCD_PSZCHR( pszContent, '<' );
  4376. if ( pszTag && ((int)(pszTag-pszContent) < nContentLen) )
  4377. {
  4378. // Concatenate all CDATA Sections and text nodes, ignore other nodes
  4379. TokenPos token( m_strDoc, m_nDocFlags );
  4380. token.m_nNext = nStartContent;
  4381. NodePos node;
  4382. while ( token.m_nNext < nStartContent + nContentLen )
  4383. {
  4384. token.ParseNode( node );
  4385. if ( node.nNodeType == MNT_TEXT )
  4386. strData += UnescapeText( &token.m_pDocText[node.nStart], node.nLength, m_nDocFlags );
  4387. else if ( node.nNodeType == MNT_CDATA_SECTION )
  4388. strData += MCD_STRMID( m_strDoc, node.nStart+9, node.nLength-12 );
  4389. }
  4390. }
  4391. else // no tags
  4392. strData = UnescapeText( &(MCD_2PCSZ(m_strDoc))[nStartContent], nContentLen, m_nDocFlags );
  4393. }
  4394. }
  4395. return strData;
  4396. }
  4397. MCD_STR CMarkup::x_GetElemContent( int iPos ) const
  4398. {
  4399. if ( ! (m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE)) )
  4400. {
  4401. ElemPos* pElem = &ELEM(iPos);
  4402. if ( iPos && pElem->ContentLen() )
  4403. return MCD_STRMID( m_strDoc, pElem->StartContent(), pElem->ContentLen() );
  4404. }
  4405. return MCD_T("");
  4406. }
  4407. bool CMarkup::x_SetElemContent( MCD_PCSZ szContent )
  4408. {
  4409. MCD_STRCLEAR(m_strResult);
  4410. if ( m_nDocFlags & (MDF_READFILE|MDF_WRITEFILE) )
  4411. return false;
  4412. // Set data in iPos element only
  4413. if ( ! m_iPos )
  4414. return false;
  4415. if ( m_nNodeLength )
  4416. return false; // not an element
  4417. // Unlink all children
  4418. int iPos = m_iPos;
  4419. int iPosChild = ELEM(iPos).iElemChild;
  4420. bool bHadChild = (iPosChild != 0);
  4421. while ( iPosChild )
  4422. iPosChild = x_ReleaseSubDoc( iPosChild );
  4423. if ( bHadChild )
  4424. x_CheckSavedPos();
  4425. // Parse content
  4426. bool bWellFormed = true;
  4427. TokenPos token( szContent, m_nDocFlags );
  4428. int iPosVirtual = x_GetFreePos();
  4429. ELEM(iPosVirtual).ClearVirtualParent();
  4430. ELEM(iPosVirtual).SetLevel( ELEM(iPos).Level() + 1 );
  4431. iPosChild = x_ParseElem( iPosVirtual, token );
  4432. if ( ELEM(iPosVirtual).nFlags & MNF_ILLFORMED )
  4433. bWellFormed = false;
  4434. ELEM(iPos).nFlags = (ELEM(iPos).nFlags & ~MNF_ILLDATA) | (ELEM(iPosVirtual).nFlags & MNF_ILLDATA);
  4435. // Prepare insert and adjust offsets
  4436. NodePos node( MNF_WITHNOLINES|MNF_REPLACE );
  4437. node.strMeta = szContent;
  4438. int iPosBefore = 0;
  4439. int nReplace = x_InsertNew( iPos, iPosBefore, node );
  4440. // Adjust and link in the inserted elements
  4441. x_Adjust( iPosChild, node.nStart );
  4442. ELEM(iPosChild).nStart += node.nStart;
  4443. ELEM(iPos).iElemChild = iPosChild;
  4444. while ( iPosChild )
  4445. {
  4446. ELEM(iPosChild).iElemParent = iPos;
  4447. iPosChild = ELEM(iPosChild).iElemNext;
  4448. }
  4449. x_ReleasePos( iPosVirtual );
  4450. int nAdjust = MCD_STRLENGTH(node.strMeta) - nReplace;
  4451. x_Adjust( iPos, nAdjust, true );
  4452. ELEM(iPos).nLength += nAdjust;
  4453. x_SetPos( m_iPosParent, m_iPos, 0 );
  4454. return bWellFormed;
  4455. }
  4456. void CMarkup::x_DocChange( int nLeft, int nReplace, const MCD_STR& strInsert )
  4457. {
  4458. x_StrInsertReplace( m_strDoc, nLeft, nReplace, strInsert );
  4459. }
  4460. void CMarkup::x_Adjust( int iPos, int nShift, bool bAfterPos /*=false*/ )
  4461. {
  4462. // Loop through affected elements and adjust indexes
  4463. // Algorithm:
  4464. // 1. update children unless bAfterPos
  4465. // (if no children or bAfterPos is true, length of iPos not affected)
  4466. // 2. update starts of next siblings and their children
  4467. // 3. go up until there is a next sibling of a parent and update starts
  4468. // 4. step 2
  4469. int iPosTop = ELEM(iPos).iElemParent;
  4470. bool bPosFirst = bAfterPos; // mark as first to skip its children
  4471. // Stop when we've reached the virtual parent (which has no tags)
  4472. while ( ELEM(iPos).StartTagLen() )
  4473. {
  4474. // Were we at containing parent of affected position?
  4475. bool bPosTop = false;
  4476. if ( iPos == iPosTop )
  4477. {
  4478. // Move iPosTop up one towards root
  4479. iPosTop = ELEM(iPos).iElemParent;
  4480. bPosTop = true;
  4481. }
  4482. // Traverse to the next update position
  4483. if ( ! bPosTop && ! bPosFirst && ELEM(iPos).iElemChild )
  4484. {
  4485. // Depth first
  4486. iPos = ELEM(iPos).iElemChild;
  4487. }
  4488. else if ( ELEM(iPos).iElemNext )
  4489. {
  4490. iPos = ELEM(iPos).iElemNext;
  4491. }
  4492. else
  4493. {
  4494. // Look for next sibling of a parent of iPos
  4495. // When going back up, parents have already been done except iPosTop
  4496. while ( 1 )
  4497. {
  4498. iPos = ELEM(iPos).iElemParent;
  4499. if ( iPos == iPosTop )
  4500. break;
  4501. if ( ELEM(iPos).iElemNext )
  4502. {
  4503. iPos = ELEM(iPos).iElemNext;
  4504. break;
  4505. }
  4506. }
  4507. }
  4508. bPosFirst = false;
  4509. // Shift indexes at iPos
  4510. if ( iPos != iPosTop )
  4511. ELEM(iPos).nStart += nShift;
  4512. else
  4513. ELEM(iPos).nLength += nShift;
  4514. }
  4515. }
  4516. int CMarkup::x_InsertNew( int iPosParent, int& iPosRel, NodePos& node )
  4517. {
  4518. // Parent empty tag or tags with no content?
  4519. bool bEmptyParentTag = iPosParent && ELEM(iPosParent).IsEmptyElement();
  4520. bool bNoContentParentTags = iPosParent && ! ELEM(iPosParent).ContentLen();
  4521. if ( iPosRel && ! node.nLength ) // current position element?
  4522. {
  4523. node.nStart = ELEM(iPosRel).nStart;
  4524. if ( ! (node.nNodeFlags & MNF_INSERT) ) // follow iPosRel
  4525. node.nStart += ELEM(iPosRel).nLength;
  4526. }
  4527. else if ( bEmptyParentTag ) // parent has no separate end tag?
  4528. {
  4529. // Split empty parent element
  4530. if ( ELEM(iPosParent).nFlags & MNF_NONENDED )
  4531. node.nStart = ELEM(iPosParent).StartContent();
  4532. else
  4533. node.nStart = ELEM(iPosParent).StartContent() - 1;
  4534. }
  4535. else if ( node.nLength || (m_nDocFlags&MDF_WRITEFILE) ) // non-element node or a file mode zero length position?
  4536. {
  4537. if ( ! (node.nNodeFlags & MNF_INSERT) )
  4538. node.nStart += node.nLength; // after node or file mode position
  4539. }
  4540. else // no current node
  4541. {
  4542. // Insert relative to parent's content
  4543. if ( node.nNodeFlags & (MNF_INSERT|MNF_REPLACE) )
  4544. node.nStart = ELEM(iPosParent).StartContent(); // beginning of parent's content
  4545. else // in front of parent's end tag
  4546. node.nStart = ELEM(iPosParent).StartAfter() - ELEM(iPosParent).EndTagLen();
  4547. }
  4548. // Go up to start of next node, unless its splitting an empty element
  4549. if ( ! (node.nNodeFlags&(MNF_WITHNOLINES|MNF_REPLACE)) && ! bEmptyParentTag )
  4550. {
  4551. TokenPos token( m_strDoc, m_nDocFlags );
  4552. node.nStart = token.WhitespaceToTag( node.nStart );
  4553. }
  4554. // Is insert relative to element position? (i.e. not other kind of node)
  4555. if ( ! node.nLength )
  4556. {
  4557. // Modify iPosRel to reflect position before
  4558. if ( iPosRel )
  4559. {
  4560. if ( node.nNodeFlags & MNF_INSERT )
  4561. {
  4562. if ( ! (ELEM(iPosRel).nFlags & MNF_FIRST) )
  4563. iPosRel = ELEM(iPosRel).iElemPrev;
  4564. else
  4565. iPosRel = 0;
  4566. }
  4567. }
  4568. else if ( ! (node.nNodeFlags & MNF_INSERT) )
  4569. {
  4570. // If parent has a child, add after last child
  4571. if ( ELEM(iPosParent).iElemChild )
  4572. iPosRel = ELEM(ELEM(iPosParent).iElemChild).iElemPrev;
  4573. }
  4574. }
  4575. // Get node length (needed for x_AddNode and x_AddSubDoc in file write mode)
  4576. node.nLength = MCD_STRLENGTH(node.strMeta);
  4577. // Prepare end of lines
  4578. if ( (! (node.nNodeFlags & MNF_WITHNOLINES)) && (bEmptyParentTag || bNoContentParentTags) )
  4579. node.nStart += MCD_EOLLEN;
  4580. if ( ! (node.nNodeFlags & MNF_WITHNOLINES) )
  4581. node.strMeta += MCD_EOL;
  4582. // Calculate insert offset and replace length
  4583. int nReplace = 0;
  4584. int nInsertAt = node.nStart;
  4585. if ( bEmptyParentTag )
  4586. {
  4587. MCD_STR strTagName = x_GetTagName( iPosParent );
  4588. MCD_STR strFormat;
  4589. if ( node.nNodeFlags & MNF_WITHNOLINES )
  4590. strFormat = MCD_T(">");
  4591. else
  4592. strFormat = MCD_T(">") MCD_EOL;
  4593. strFormat += node.strMeta;
  4594. strFormat += MCD_T("</");
  4595. strFormat += strTagName;
  4596. node.strMeta = strFormat;
  4597. if ( ELEM(iPosParent).nFlags & MNF_NONENDED )
  4598. {
  4599. nInsertAt = ELEM(iPosParent).StartAfter() - 1;
  4600. nReplace = 0;
  4601. ELEM(iPosParent).nFlags ^= MNF_NONENDED;
  4602. }
  4603. else
  4604. {
  4605. nInsertAt = ELEM(iPosParent).StartAfter() - 2;
  4606. nReplace = 1;
  4607. ELEM(iPosParent).AdjustStartTagLen( -1 );
  4608. }
  4609. ELEM(iPosParent).SetEndTagLen( 3 + MCD_STRLENGTH(strTagName) );
  4610. }
  4611. else
  4612. {
  4613. if ( node.nNodeFlags & MNF_REPLACE )
  4614. {
  4615. nInsertAt = ELEM(iPosParent).StartContent();
  4616. nReplace = ELEM(iPosParent).ContentLen();
  4617. }
  4618. else if ( bNoContentParentTags )
  4619. {
  4620. node.strMeta = MCD_EOL + node.strMeta;
  4621. nInsertAt = ELEM(iPosParent).StartContent();
  4622. }
  4623. }
  4624. if ( m_nDocFlags & MDF_WRITEFILE )
  4625. {
  4626. // Check if buffer is full
  4627. int nNewDocLength = MCD_STRLENGTH(m_strDoc) + MCD_STRLENGTH(node.strMeta) - nReplace;
  4628. int nFlushTo = node.nStart;
  4629. MCD_STRCLEAR( m_strResult );
  4630. if ( bEmptyParentTag )
  4631. nFlushTo = ELEM(iPosParent).nStart;
  4632. if ( nFlushTo && nNewDocLength > m_pFilePos->m_nBlockSizeBasis )
  4633. {
  4634. int nDocCapacity = MCD_STRCAPACITY(m_strDoc);
  4635. if ( nNewDocLength > nDocCapacity )
  4636. {
  4637. if ( bEmptyParentTag )
  4638. ELEM(iPosParent).nStart = 0;
  4639. node.nStart -= nFlushTo;
  4640. nInsertAt -= nFlushTo;
  4641. m_pFilePos->FileFlush( m_strDoc, nFlushTo );
  4642. m_strResult = m_pFilePos->m_strIOResult;
  4643. }
  4644. }
  4645. }
  4646. x_DocChange( nInsertAt, nReplace, node.strMeta );
  4647. return nReplace;
  4648. }
  4649. bool CMarkup::x_AddElem( MCD_PCSZ pName, int nValue, int nFlags )
  4650. {
  4651. // Convert integer to string
  4652. MCD_CHAR szVal[25];
  4653. MCD_SPRINTF( MCD_SSZ(szVal), MCD_T("%d"), nValue );
  4654. return x_AddElem( pName, szVal, nFlags );
  4655. }
  4656. bool CMarkup::x_AddElem( MCD_PCSZ pName, MCD_PCSZ pValue, int nFlags )
  4657. {
  4658. if ( m_nDocFlags & MDF_READFILE )
  4659. return false;
  4660. if ( nFlags & MNF_CHILD )
  4661. {
  4662. // Adding a child element under main position
  4663. if ( ! m_iPos || (m_nDocFlags & MDF_WRITEFILE) )
  4664. return false;
  4665. }
  4666. // Cannot have data in non-ended element
  4667. if ( (nFlags&MNF_WITHNOEND) && pValue && pValue[0] )
  4668. return false;
  4669. // Node and element structures
  4670. NodePos node( nFlags );
  4671. int iPosParent = 0, iPosBefore = 0;
  4672. int iPos = x_GetFreePos();
  4673. ElemPos* pElem = &ELEM(iPos);
  4674. // Locate where to add element relative to current node
  4675. if ( nFlags & MNF_CHILD )
  4676. {
  4677. iPosParent = m_iPos;
  4678. iPosBefore = m_iPosChild;
  4679. }
  4680. else
  4681. {
  4682. iPosParent = m_iPosParent;
  4683. iPosBefore = m_iPos;
  4684. node.nStart = m_nNodeOffset;
  4685. node.nLength = m_nNodeLength;
  4686. }
  4687. // Create string for insert
  4688. // If no pValue is specified, an empty element is created
  4689. // i.e. either <NAME>value</NAME> or <NAME/>
  4690. //
  4691. int nLenName = MCD_PSZLEN(pName);
  4692. if ( ! pValue || ! pValue[0] )
  4693. {
  4694. // <NAME/> empty element
  4695. MCD_BLDRESERVE( node.strMeta, nLenName + 4 );
  4696. MCD_BLDAPPEND1( node.strMeta, '<' );
  4697. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4698. if ( nFlags & MNF_WITHNOEND )
  4699. {
  4700. MCD_BLDAPPEND1( node.strMeta, '>' );
  4701. }
  4702. else
  4703. {
  4704. if ( nFlags & MNF_WITHXHTMLSPACE )
  4705. {
  4706. MCD_BLDAPPENDN( node.strMeta, MCD_T(" />"), 3 );
  4707. }
  4708. else
  4709. {
  4710. MCD_BLDAPPENDN( node.strMeta, MCD_T("/>"), 2 );
  4711. }
  4712. }
  4713. MCD_BLDRELEASE( node.strMeta );
  4714. pElem->nLength = MCD_STRLENGTH( node.strMeta );
  4715. pElem->SetStartTagLen( pElem->nLength );
  4716. pElem->SetEndTagLen( 0 );
  4717. }
  4718. else
  4719. {
  4720. // <NAME>value</NAME>
  4721. MCD_STR strValue;
  4722. if ( nFlags & MNF_WITHCDATA )
  4723. strValue = x_EncodeCDATASection( pValue );
  4724. else
  4725. strValue = EscapeText( pValue, nFlags );
  4726. int nLenValue = MCD_STRLENGTH(strValue);
  4727. pElem->nLength = nLenName * 2 + nLenValue + 5;
  4728. MCD_BLDRESERVE( node.strMeta, pElem->nLength );
  4729. MCD_BLDAPPEND1( node.strMeta, '<' );
  4730. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4731. MCD_BLDAPPEND1( node.strMeta, '>' );
  4732. MCD_BLDAPPENDN( node.strMeta, MCD_2PCSZ(strValue), nLenValue );
  4733. MCD_BLDAPPENDN( node.strMeta, MCD_T("</"), 2 );
  4734. MCD_BLDAPPENDN( node.strMeta, pName, nLenName );
  4735. MCD_BLDAPPEND1( node.strMeta, '>' );
  4736. MCD_BLDRELEASE( node.strMeta );
  4737. pElem->SetEndTagLen( nLenName + 3 );
  4738. pElem->SetStartTagLen( nLenName + 2 );
  4739. }
  4740. // Insert
  4741. int nReplace = x_InsertNew( iPosParent, iPosBefore, node );
  4742. pElem->nStart = node.nStart;
  4743. pElem->iElemChild = 0;
  4744. if ( nFlags & MNF_WITHNOEND )
  4745. pElem->nFlags = MNF_NONENDED;
  4746. else
  4747. pElem->nFlags = 0;
  4748. if ( m_nDocFlags & MDF_WRITEFILE )
  4749. {
  4750. iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, iPos );
  4751. TokenPos token( m_strDoc, m_nDocFlags );
  4752. token.m_nL = pElem->nStart + 1;
  4753. token.m_nR = pElem->nStart + nLenName;
  4754. m_pFilePos->m_elemstack.PushTagAndCount( token );
  4755. }
  4756. else
  4757. {
  4758. x_LinkElem( iPosParent, iPosBefore, iPos );
  4759. x_Adjust( iPos, MCD_STRLENGTH(node.strMeta) - nReplace );
  4760. }
  4761. if ( nFlags & MNF_CHILD )
  4762. x_SetPos( m_iPosParent, iPosParent, iPos );
  4763. else
  4764. x_SetPos( iPosParent, iPos, 0 );
  4765. return true;
  4766. }
  4767. MCD_STR CMarkup::x_GetSubDoc( int iPos )
  4768. {
  4769. if ( iPos && ! (m_nDocFlags&MDF_WRITEFILE) )
  4770. {
  4771. if ( ! (m_nDocFlags&MDF_READFILE) )
  4772. {
  4773. TokenPos token( m_strDoc, m_nDocFlags );
  4774. token.WhitespaceToTag( ELEM(iPos).StartAfter() );
  4775. token.m_nL = ELEM(iPos).nStart;
  4776. return token.GetTokenText();
  4777. }
  4778. }
  4779. return MCD_T("");
  4780. }
  4781. bool CMarkup::x_AddSubDoc( MCD_PCSZ pSubDoc, int nFlags )
  4782. {
  4783. if ( m_nDocFlags & MDF_READFILE || ((nFlags & MNF_CHILD) && (m_nDocFlags & MDF_WRITEFILE)) )
  4784. return false;
  4785. MCD_STRCLEAR(m_strResult);
  4786. NodePos node( nFlags );
  4787. int iPosParent, iPosBefore;
  4788. if ( nFlags & MNF_CHILD )
  4789. {
  4790. // Add a subdocument under main position, before or after child
  4791. if ( ! m_iPos )
  4792. return false;
  4793. iPosParent = m_iPos;
  4794. iPosBefore = m_iPosChild;
  4795. }
  4796. else
  4797. {
  4798. // Add a subdocument under parent position, before or after main
  4799. iPosParent = m_iPosParent;
  4800. iPosBefore = m_iPos;
  4801. node.nStart = m_nNodeOffset;
  4802. node.nLength = m_nNodeLength;
  4803. }
  4804. // Parse subdocument, generating indexes based on the subdocument string to be offset later
  4805. bool bWellFormed = true;
  4806. TokenPos token( pSubDoc, m_nDocFlags );
  4807. int iPosVirtual = x_GetFreePos();
  4808. ELEM(iPosVirtual).ClearVirtualParent();
  4809. ELEM(iPosVirtual).SetLevel( ELEM(iPosParent).Level() + 1 );
  4810. int iPos = x_ParseElem( iPosVirtual, token );
  4811. if ( (!iPos) || ELEM(iPosVirtual).nFlags & MNF_ILLFORMED )
  4812. bWellFormed = false;
  4813. if ( ELEM(iPosVirtual).nFlags & MNF_ILLDATA )
  4814. ELEM(iPosParent).nFlags |= MNF_ILLDATA;
  4815. // File write mode handling
  4816. bool bBypassSubDoc = false;
  4817. if ( m_nDocFlags & MDF_WRITEFILE )
  4818. {
  4819. // Current position will bypass subdoc unless well-formed single element
  4820. if ( (! bWellFormed) || ELEM(iPos).iElemChild || ELEM(iPos).iElemNext )
  4821. bBypassSubDoc = true;
  4822. // Count tag names of top level elements (usually one) in given markup
  4823. int iPosTop = iPos;
  4824. while ( iPosTop )
  4825. {
  4826. token.m_nNext = ELEM(iPosTop).nStart + 1;
  4827. token.FindName();
  4828. m_pFilePos->m_elemstack.PushTagAndCount( token );
  4829. iPosTop = ELEM(iPosTop).iElemNext;
  4830. }
  4831. }
  4832. // Extract subdocument without leading/trailing nodes
  4833. int nExtractStart = 0;
  4834. int iPosLast = ELEM(iPos).iElemPrev;
  4835. if ( bWellFormed )
  4836. {
  4837. nExtractStart = ELEM(iPos).nStart;
  4838. int nExtractLength = ELEM(iPos).nLength;
  4839. if ( iPos != iPosLast )
  4840. {
  4841. nExtractLength = ELEM(iPosLast).nStart - nExtractStart + ELEM(iPosLast).nLength;
  4842. bWellFormed = false; // treat as subdoc here, but return not well-formed
  4843. }
  4844. MCD_STRASSIGN(node.strMeta,&pSubDoc[nExtractStart],nExtractLength);
  4845. }
  4846. else
  4847. {
  4848. node.strMeta = pSubDoc;
  4849. node.nNodeFlags |= MNF_WITHNOLINES;
  4850. }
  4851. // Insert
  4852. int nReplace = x_InsertNew( iPosParent, iPosBefore, node );
  4853. // Clean up indexes
  4854. if ( m_nDocFlags & MDF_WRITEFILE )
  4855. {
  4856. if ( bBypassSubDoc )
  4857. {
  4858. // Release indexes used in parsing the subdocument
  4859. m_iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, 0 );
  4860. m_iPosFree = 1;
  4861. m_iPosDeleted = 0;
  4862. m_iPos = 0;
  4863. m_nNodeOffset = node.nStart + node.nLength;
  4864. m_nNodeLength = 0;
  4865. m_nNodeType = 0;
  4866. MARKUP_SETDEBUGSTATE;
  4867. return bWellFormed;
  4868. }
  4869. else // single element added
  4870. {
  4871. m_iPos = iPos;
  4872. ElemPos* pElem = &ELEM(iPos);
  4873. pElem->nStart = node.nStart;
  4874. m_iPosParent = x_UnlinkPrevElem( iPosParent, iPosBefore, iPos );
  4875. x_ReleasePos( iPosVirtual );
  4876. }
  4877. }
  4878. else
  4879. {
  4880. // Adjust and link in the inserted elements
  4881. // iPosVirtual will stop it from affecting rest of document
  4882. int nAdjust = node.nStart - nExtractStart;
  4883. if ( iPos && nAdjust )
  4884. {
  4885. x_Adjust( iPos, nAdjust );
  4886. ELEM(iPos).nStart += nAdjust;
  4887. }
  4888. int iPosChild = iPos;
  4889. while ( iPosChild )
  4890. {
  4891. int iPosNext = ELEM(iPosChild).iElemNext;
  4892. x_LinkElem( iPosParent, iPosBefore, iPosChild );
  4893. iPosBefore = iPosChild;
  4894. iPosChild = iPosNext;
  4895. }
  4896. x_ReleasePos( iPosVirtual );
  4897. // Now adjust remainder of document
  4898. x_Adjust( iPosLast, MCD_STRLENGTH(node.strMeta) - nReplace, true );
  4899. }
  4900. // Set position to top element of subdocument
  4901. if ( nFlags & MNF_CHILD )
  4902. x_SetPos( m_iPosParent, iPosParent, iPos );
  4903. else // Main
  4904. x_SetPos( m_iPosParent, iPos, 0 );
  4905. return bWellFormed;
  4906. }
  4907. int CMarkup::x_RemoveElem( int iPos )
  4908. {
  4909. // Determine whether any whitespace up to next tag
  4910. TokenPos token( m_strDoc, m_nDocFlags );
  4911. int nAfterEnd = token.WhitespaceToTag( ELEM(iPos).StartAfter() );
  4912. // Remove from document, adjust affected indexes, and unlink
  4913. int nLen = nAfterEnd - ELEM(iPos).nStart;
  4914. x_DocChange( ELEM(iPos).nStart, nLen, MCD_STR() );
  4915. x_Adjust( iPos, - nLen, true );
  4916. int iPosPrev = x_UnlinkElem( iPos );
  4917. x_CheckSavedPos();
  4918. return iPosPrev; // new position
  4919. }
  4920. void CMarkup::x_LinkElem( int iPosParent, int iPosBefore, int iPos )
  4921. {
  4922. // Update links between elements and initialize nFlags
  4923. ElemPos* pElem = &ELEM(iPos);
  4924. if ( m_nDocFlags & MDF_WRITEFILE )
  4925. {
  4926. // In file write mode, only keep virtual parent 0 plus one element
  4927. if ( iPosParent )
  4928. x_ReleasePos( iPosParent );
  4929. else if ( iPosBefore )
  4930. x_ReleasePos( iPosBefore );
  4931. iPosParent = 0;
  4932. ELEM(iPosParent).iElemChild = iPos;
  4933. pElem->iElemParent = iPosParent;
  4934. pElem->iElemPrev = iPos;
  4935. pElem->iElemNext = 0;
  4936. pElem->nFlags |= MNF_FIRST;
  4937. }
  4938. else
  4939. {
  4940. pElem->iElemParent = iPosParent;
  4941. if ( iPosBefore )
  4942. {
  4943. // Link in after iPosBefore
  4944. pElem->nFlags &= ~MNF_FIRST;
  4945. pElem->iElemNext = ELEM(iPosBefore).iElemNext;
  4946. if ( pElem->iElemNext )
  4947. ELEM(pElem->iElemNext).iElemPrev = iPos;
  4948. else
  4949. ELEM(ELEM(iPosParent).iElemChild).iElemPrev = iPos;
  4950. ELEM(iPosBefore).iElemNext = iPos;
  4951. pElem->iElemPrev = iPosBefore;
  4952. }
  4953. else
  4954. {
  4955. // Link in as first child
  4956. pElem->nFlags |= MNF_FIRST;
  4957. if ( ELEM(iPosParent).iElemChild )
  4958. {
  4959. pElem->iElemNext = ELEM(iPosParent).iElemChild;
  4960. pElem->iElemPrev = ELEM(pElem->iElemNext).iElemPrev;
  4961. ELEM(pElem->iElemNext).iElemPrev = iPos;
  4962. ELEM(pElem->iElemNext).nFlags ^= MNF_FIRST;
  4963. }
  4964. else
  4965. {
  4966. pElem->iElemNext = 0;
  4967. pElem->iElemPrev = iPos;
  4968. }
  4969. ELEM(iPosParent).iElemChild = iPos;
  4970. }
  4971. if ( iPosParent )
  4972. pElem->SetLevel( ELEM(iPosParent).Level() + 1 );
  4973. }
  4974. }
  4975. int CMarkup::x_UnlinkElem( int iPos )
  4976. {
  4977. // Fix links to remove element and mark as deleted
  4978. // return previous position or zero if none
  4979. ElemPos* pElem = &ELEM(iPos);
  4980. // Find previous sibling and bypass removed element
  4981. int iPosPrev = 0;
  4982. if ( pElem->nFlags & MNF_FIRST )
  4983. {
  4984. if ( pElem->iElemNext ) // set next as first child
  4985. {
  4986. ELEM(pElem->iElemParent).iElemChild = pElem->iElemNext;
  4987. ELEM(pElem->iElemNext).iElemPrev = pElem->iElemPrev;
  4988. ELEM(pElem->iElemNext).nFlags |= MNF_FIRST;
  4989. }
  4990. else // no children remaining
  4991. ELEM(pElem->iElemParent).iElemChild = 0;
  4992. }
  4993. else
  4994. {
  4995. iPosPrev = pElem->iElemPrev;
  4996. ELEM(iPosPrev).iElemNext = pElem->iElemNext;
  4997. if ( pElem->iElemNext )
  4998. ELEM(pElem->iElemNext).iElemPrev = iPosPrev;
  4999. else
  5000. ELEM(ELEM(pElem->iElemParent).iElemChild).iElemPrev = iPosPrev;
  5001. }
  5002. x_ReleaseSubDoc( iPos );
  5003. return iPosPrev;
  5004. }
  5005. int CMarkup::x_UnlinkPrevElem( int iPosParent, int iPosBefore, int iPos )
  5006. {
  5007. // In file write mode, only keep virtual parent 0 plus one element if currently at element
  5008. if ( iPosParent )
  5009. {
  5010. x_ReleasePos( iPosParent );
  5011. iPosParent = 0;
  5012. }
  5013. else if ( iPosBefore )
  5014. x_ReleasePos( iPosBefore );
  5015. ELEM(iPosParent).iElemChild = iPos;
  5016. ELEM(iPosParent).nLength = MCD_STRLENGTH(m_strDoc);
  5017. if ( iPos )
  5018. {
  5019. ElemPos* pElem = &ELEM(iPos);
  5020. pElem->iElemParent = iPosParent;
  5021. pElem->iElemPrev = iPos;
  5022. pElem->iElemNext = 0;
  5023. pElem->nFlags |= MNF_FIRST;
  5024. }
  5025. return iPosParent;
  5026. }
  5027. int CMarkup::x_ReleasePos( int iPos )
  5028. {
  5029. int iPosNext = ELEM(iPos).iElemNext;
  5030. ELEM(iPos).iElemNext = m_iPosDeleted;
  5031. ELEM(iPos).nFlags = MNF_DELETED;
  5032. m_iPosDeleted = iPos;
  5033. return iPosNext;
  5034. }
  5035. int CMarkup::x_ReleaseSubDoc( int iPos )
  5036. {
  5037. // Mark position structures as deleted by depth first traversal
  5038. // Tricky because iElemNext used in traversal is overwritten for linked list of deleted
  5039. // Return value is what iElemNext was before being overwritten
  5040. //
  5041. int iPosNext = 0, iPosTop = iPos;
  5042. while ( 1 )
  5043. {
  5044. if ( ELEM(iPos).iElemChild )
  5045. iPos = ELEM(iPos).iElemChild;
  5046. else
  5047. {
  5048. while ( 1 )
  5049. {
  5050. iPosNext = x_ReleasePos( iPos );
  5051. if ( iPosNext || iPos == iPosTop )
  5052. break;
  5053. iPos = ELEM(iPos).iElemParent;
  5054. }
  5055. if ( iPos == iPosTop )
  5056. break;
  5057. iPos = iPosNext;
  5058. }
  5059. }
  5060. return iPosNext;
  5061. }
  5062. void CMarkup::x_CheckSavedPos()
  5063. {
  5064. // Remove any saved positions now pointing to deleted elements
  5065. // Must be done as part of element removal before position reassigned
  5066. if ( m_pSavedPosMaps->m_pMaps )
  5067. {
  5068. int nMap = 0;
  5069. while ( m_pSavedPosMaps->m_pMaps[nMap] )
  5070. {
  5071. SavedPosMap* pMap = m_pSavedPosMaps->m_pMaps[nMap];
  5072. for ( int nSlot = 0; nSlot < pMap->nMapSize; ++nSlot )
  5073. {
  5074. SavedPos* pSavedPos = pMap->pTable[nSlot];
  5075. if ( pSavedPos )
  5076. {
  5077. int nOffset = 0;
  5078. int nSavedPosCount = 0;
  5079. while ( 1 )
  5080. {
  5081. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_USED )
  5082. {
  5083. int iPos = pSavedPos[nOffset].iPos;
  5084. if ( ! (ELEM(iPos).nFlags & MNF_DELETED) )
  5085. {
  5086. if ( nSavedPosCount < nOffset )
  5087. {
  5088. pSavedPos[nSavedPosCount] = pSavedPos[nOffset];
  5089. pSavedPos[nSavedPosCount].nSavedPosFlags &= ~SavedPos::SPM_LAST;
  5090. }
  5091. ++nSavedPosCount;
  5092. }
  5093. }
  5094. if ( pSavedPos[nOffset].nSavedPosFlags & SavedPos::SPM_LAST )
  5095. {
  5096. while ( nSavedPosCount <= nOffset )
  5097. pSavedPos[nSavedPosCount++].nSavedPosFlags &= ~SavedPos::SPM_USED;
  5098. break;
  5099. }
  5100. ++nOffset;
  5101. }
  5102. }
  5103. }
  5104. ++nMap;
  5105. }
  5106. }
  5107. }
  5108. void CMarkup::x_AdjustForNode( int iPosParent, int iPos, int nShift )
  5109. {
  5110. // Adjust affected indexes
  5111. bool bAfterPos = true;
  5112. if ( ! iPos )
  5113. {
  5114. // Change happened before or at first element under iPosParent
  5115. // If there are any children of iPosParent, adjust from there
  5116. // otherwise start at parent and adjust from there
  5117. iPos = ELEM(iPosParent).iElemChild;
  5118. if ( iPos )
  5119. {
  5120. ELEM(iPos).nStart += nShift;
  5121. bAfterPos = false;
  5122. }
  5123. else
  5124. {
  5125. iPos = iPosParent;
  5126. ELEM(iPos).nLength += nShift;
  5127. }
  5128. }
  5129. x_Adjust( iPos, nShift, bAfterPos );
  5130. }
  5131. bool CMarkup::x_AddNode( int nNodeType, MCD_PCSZ pText, int nNodeFlags )
  5132. {
  5133. if ( m_nDocFlags & MDF_READFILE )
  5134. return false;
  5135. // Comments, DTDs, and processing instructions are followed by CRLF
  5136. // Other nodes are usually concerned with mixed content, so no CRLF
  5137. if ( ! (nNodeType & (MNT_PROCESSING_INSTRUCTION|MNT_COMMENT|MNT_DOCUMENT_TYPE)) )
  5138. nNodeFlags |= MNF_WITHNOLINES;
  5139. // Add node of nNodeType after current node position
  5140. NodePos node( nNodeFlags );
  5141. if ( ! x_CreateNode(node.strMeta, nNodeType, pText) )
  5142. return false;
  5143. // Insert the new node relative to current node
  5144. node.nStart = m_nNodeOffset;
  5145. node.nLength = m_nNodeLength;
  5146. node.nNodeType = nNodeType;
  5147. int iPosBefore = m_iPos;
  5148. int nReplace = x_InsertNew( m_iPosParent, iPosBefore, node );
  5149. // If its a new element, create an ElemPos
  5150. int iPos = iPosBefore;
  5151. ElemPos* pElem = NULL;
  5152. if ( nNodeType == MNT_ELEMENT )
  5153. {
  5154. // Set indexes
  5155. iPos = x_GetFreePos();
  5156. pElem = &ELEM(iPos);
  5157. pElem->nStart = node.nStart;
  5158. pElem->SetStartTagLen( node.nLength );
  5159. pElem->SetEndTagLen( 0 );
  5160. pElem->nLength = node.nLength;
  5161. node.nStart = 0;
  5162. node.nLength = 0;
  5163. pElem->iElemChild = 0;
  5164. pElem->nFlags = 0;
  5165. x_LinkElem( m_iPosParent, iPosBefore, iPos );
  5166. }
  5167. if ( m_nDocFlags & MDF_WRITEFILE )
  5168. {
  5169. m_iPosParent = x_UnlinkPrevElem( m_iPosParent, iPosBefore, iPos );
  5170. if ( nNodeType == MNT_ELEMENT )
  5171. {
  5172. TokenPos token( m_strDoc, m_nDocFlags );
  5173. token.m_nL = pElem->nStart + 1;
  5174. token.m_nR = pElem->nStart + pElem->nLength - 3;
  5175. m_pFilePos->m_elemstack.PushTagAndCount( token );
  5176. }
  5177. }
  5178. else // need to adjust element positions after iPos
  5179. x_AdjustForNode( m_iPosParent, iPos, MCD_STRLENGTH(node.strMeta) - nReplace );
  5180. // Store current position
  5181. m_iPos = iPos;
  5182. m_iPosChild = 0;
  5183. m_nNodeOffset = node.nStart;
  5184. m_nNodeLength = node.nLength;
  5185. m_nNodeType = nNodeType;
  5186. MARKUP_SETDEBUGSTATE;
  5187. return true;
  5188. }
  5189. void CMarkup::x_RemoveNode( int iPosParent, int& iPos, int& nNodeType, int& nNodeOffset, int& nNodeLength )
  5190. {
  5191. int iPosPrev = iPos;
  5192. // Removing an element?
  5193. if ( nNodeType == MNT_ELEMENT )
  5194. {
  5195. nNodeOffset = ELEM(iPos).nStart;
  5196. nNodeLength = ELEM(iPos).nLength;
  5197. iPosPrev = x_UnlinkElem( iPos );
  5198. x_CheckSavedPos();
  5199. }
  5200. // Find previous node type, offset and length
  5201. int nPrevOffset = 0;
  5202. if ( iPosPrev )
  5203. nPrevOffset = ELEM(iPosPrev).StartAfter();
  5204. else if ( iPosParent )
  5205. nPrevOffset = ELEM(iPosParent).StartContent();
  5206. TokenPos token( m_strDoc, m_nDocFlags );
  5207. NodePos node;
  5208. token.m_nNext = nPrevOffset;
  5209. int nPrevType = 0;
  5210. while ( token.m_nNext < nNodeOffset )
  5211. {
  5212. nPrevOffset = token.m_nNext;
  5213. nPrevType = token.ParseNode( node );
  5214. }
  5215. int nPrevLength = nNodeOffset - nPrevOffset;
  5216. if ( ! nPrevLength )
  5217. {
  5218. // Previous node is iPosPrev element
  5219. nPrevOffset = 0;
  5220. if ( iPosPrev )
  5221. nPrevType = MNT_ELEMENT;
  5222. }
  5223. // Remove node from document
  5224. x_DocChange( nNodeOffset, nNodeLength, MCD_STR() );
  5225. x_AdjustForNode( iPosParent, iPosPrev, - nNodeLength );
  5226. // Was removed node a lone end tag?
  5227. if ( nNodeType == MNT_LONE_END_TAG )
  5228. {
  5229. // See if we can unset parent MNF_ILLDATA flag
  5230. token.m_nNext = ELEM(iPosParent).StartContent();
  5231. int nEndOfContent = token.m_nNext + ELEM(iPosParent).ContentLen();
  5232. int iPosChild = ELEM(iPosParent).iElemChild;
  5233. while ( token.m_nNext < nEndOfContent )
  5234. {
  5235. if ( token.ParseNode(node) <= 0 )
  5236. break;
  5237. if ( node.nNodeType == MNT_ELEMENT )
  5238. {
  5239. token.m_nNext = ELEM(iPosChild).StartAfter();
  5240. iPosChild = ELEM(iPosChild).iElemNext;
  5241. }
  5242. }
  5243. if ( token.m_nNext == nEndOfContent )
  5244. ELEM(iPosParent).nFlags &= ~MNF_ILLDATA;
  5245. }
  5246. nNodeType = nPrevType;
  5247. nNodeOffset = nPrevOffset;
  5248. nNodeLength = nPrevLength;
  5249. iPos = iPosPrev;
  5250. }