Previous 199869 Revisions Next

r30789 Tuesday 3rd June, 2014 at 00:47:27 UTC by Couriersud
Quite a number of changes:
- templates to provide operations for fixed matrix sizes
- sorting of nets to increase convergence
- successive over relaxation - for popeye a parameter of 1.05 works best.
- The code commented out may provide benefits on different architectures like true
 vector architectures.
 
Popeye run peaks at 1800% (up 20%) and pong has a slight performance increase as well.
[src/emu/netlist]plists.h
[src/emu/netlist/analog]nld_solver.c nld_solver.h

trunk/src/emu/netlist/analog/nld_solver.h
r30788r30789
1111
1212//#define ATTR_ALIGNED(N) __attribute__((aligned(N)))
1313#define ATTR_ALIGNED(N) ATTR_ALIGN
14//#undef RESTRICT
15//#define RESTRICT
1416
17
1518// ----------------------------------------------------------------------------------------
1619// Macros
1720// ----------------------------------------------------------------------------------------
r30788r30789
4043    netlist_time m_nt_sync_delay;
4144};
4245
43class vector_t
46class vector_ops_t
4447{
4548public:
4649
47    vector_t(int size)
50    vector_ops_t(int size)
4851    : m_dim(size)
4952    {
5053    }
5154
52    virtual ~vector_t() {}
55    virtual ~vector_ops_t() {}
5356
5457    ATTR_ALIGNED(64) double * RESTRICT  m_V;
5558
5659    virtual const double sum(const double * v) = 0;
57    virtual void sum2(const double * v1, const double * v2, double &s1, double &s2) = 0;
58    virtual void sum2a(const double * v1, const double * v2, const double * v3abs, double &s1, double &s2, double &s3abs) = 0;
60    virtual void sum2(const double * RESTRICT v1, const double * RESTRICT v2, double & RESTRICT  s1, double & RESTRICT s2) = 0;
61    virtual void addmult(double * RESTRICT v1, const double * RESTRICT v2, const double &mult) = 0;
62    virtual void sum2a(const double * RESTRICT v1, const double * RESTRICT v2, const double * RESTRICT v3abs, double & RESTRICT s1, double & RESTRICT s2, double & RESTRICT s3abs) = 0;
5963
6064    virtual const double sumabs(const double * v) = 0;
6165
r30788r30789
6771};
6872
6973template <int m_N>
70class vector_imp_t : public vector_t
74class vector_ops_impl_t : public vector_ops_t
7175{
7276public:
7377
74    vector_imp_t()
75    : vector_t(m_N)
78    vector_ops_impl_t()
79    : vector_ops_t(m_N)
7680    {
7781    }
7882
79    vector_imp_t(int size)
80    : vector_t(size)
83    vector_ops_impl_t(int size)
84    : vector_ops_t(size)
8185    {
8286        assert(m_N == 0);
8387    }
8488
85    virtual ~vector_imp_t() {}
89    virtual ~vector_ops_impl_t() {}
8690
8791    ATTR_HOT inline const int N() const { if (m_N == 0) return m_dim; else return m_N; }
8892
8993    const double sum(const double * v)
9094    {
91        const double * RESTRICT vl = v;
95        const double * RESTRICT vl = v;
9296        double tmp = 0.0;
9397        for (int i=0; i < N(); i++)
9498            tmp += vl[i];
9599        return tmp;
96100    }
97101
98    void sum2(const double * v1, const double * v2, double &s1, double &s2)
102    void sum2(const double * RESTRICT v1, const double * RESTRICT v2, double & RESTRICT s1, double & RESTRICT s2)
99103    {
100104        const double * RESTRICT v1l = v1;
101105        const double * RESTRICT v2l = v2;
r30788r30789
106110        }
107111    }
108112
109    void sum2a(const double * v1, const double * v2, const double * v3abs, double &s1, double &s2, double &s3abs)
113    void addmult(double * RESTRICT v1, const double * RESTRICT v2, const double &mult)
110114    {
115        double * RESTRICT v1l = v1;
116        const double * RESTRICT v2l = v2;
117        for (int i=0; i < N(); i++)
118        {
119            v1l[i] += v2l[i] * mult;
120        }
121    }
122
123    void sum2a(const double * RESTRICT v1, const double * RESTRICT v2, const double * RESTRICT v3abs, double & RESTRICT s1, double & RESTRICT s2, double & RESTRICT s3abs)
124    {
111125        const double * RESTRICT v1l = v1;
112126        const double * RESTRICT v2l = v2;
113127        const double * RESTRICT v3l = v3abs;
r30788r30789
133147
134148class ATTR_ALIGNED(64) terms_t
135149{
150    NETLIST_PREVENT_COPYING(terms_t)
151
136152    public:
137153    ATTR_COLD terms_t() {}
138154
r30788r30789
152168    ATTR_HOT inline double *gt() { return m_gt; }
153169    ATTR_HOT inline double *go() { return m_go; }
154170    ATTR_HOT inline double *Idr() { return m_Idr; }
155    ATTR_HOT vector_t *ops() { return m_ops; }
171    ATTR_HOT inline double **other_curanalog() { return m_other_curanalog; }
172    ATTR_HOT vector_ops_t *ops() { return m_ops; }
156173
157174    ATTR_COLD void set_pointers();
158175
176    int m_railstart;
177
159178private:
160179    plinearlist_t<netlist_terminal_t *> m_term;
161180    plinearlist_t<int> m_net_other;
162181    plinearlist_t<double> m_gt;
163182    plinearlist_t<double> m_go;
164183    plinearlist_t<double> m_Idr;
165    vector_t * m_ops;
184    plinearlist_t<double *> m_other_curanalog;
185    vector_ops_t * m_ops;
166186};
167187
168188class netlist_matrix_solver_t : public netlist_device_t
r30788r30789
230250};
231251
232252template <int m_N, int _storage_N>
233class ATTR_ALIGNED(64) netlist_matrix_solver_direct_t: public netlist_matrix_solver_t
253class netlist_matrix_solver_direct_t: public netlist_matrix_solver_t
234254{
235255public:
236256
237   netlist_matrix_solver_direct_t()
238    : netlist_matrix_solver_t()
239    , m_dim(0)
240    {
241       for (int k=0; k<_storage_N; k++)
242           m_A[k] = & m_A_phys[k][0];
243    }
257   netlist_matrix_solver_direct_t(int size);
244258
245   virtual ~netlist_matrix_solver_direct_t() {}
259   virtual ~netlist_matrix_solver_direct_t();
246260
247261   ATTR_COLD virtual void vsetup(netlist_analog_net_t::list_t &nets);
248262   ATTR_COLD virtual void reset() { netlist_matrix_solver_t::reset(); }
r30788r30789
261275
262276    ATTR_HOT virtual double compute_next_timestep(const double);
263277
264    ATTR_ALIGNED(64) double * RESTRICT m_A[_storage_N];
265    ATTR_ALIGNED(64) double m_RHS[_storage_N];
266    ATTR_ALIGNED(64) double m_last_RHS[_storage_N]; // right hand side - contains currents
278    double m_A[_storage_N][((_storage_N + 7) / 8) * 8];
279    double m_RHS[_storage_N];
280    double m_last_RHS[_storage_N]; // right hand side - contains currents
267281
268    terms_t m_terms[_storage_N];
269    terms_t m_rails[_storage_N];
282    terms_t **m_terms;
270283
284    terms_t *m_rails_temp;
285
271286private:
272    ATTR_ALIGNED(64) double m_A_phys[_storage_N][((_storage_N + 7) / 8) * 8];
287    vector_ops_t *m_row_ops[_storage_N + 1];
273288
274289   int m_dim;
275290};
r30788r30789
279294{
280295public:
281296
282   netlist_matrix_solver_gauss_seidel_t()
283      : netlist_matrix_solver_direct_t<m_N, _storage_N>()
297   netlist_matrix_solver_gauss_seidel_t(int size)
298      : netlist_matrix_solver_direct_t<m_N, _storage_N>(size)
284299      , m_gs_fail(0)
285300      , m_gs_total(0)
286301      {}
r30788r30789
300315
301316class ATTR_ALIGNED(64) netlist_matrix_solver_direct1_t: public netlist_matrix_solver_direct_t<1,1>
302317{
318public:
319
320    netlist_matrix_solver_direct1_t()
321      : netlist_matrix_solver_direct_t<1, 1>(1)
322      {}
303323protected:
304324    ATTR_HOT int vsolve_non_dynamic();
305325private:
r30788r30789
307327
308328class ATTR_ALIGNED(64) netlist_matrix_solver_direct2_t: public netlist_matrix_solver_direct_t<2,2>
309329{
330public:
331
332    netlist_matrix_solver_direct2_t()
333      : netlist_matrix_solver_direct_t<2, 2>(2)
334      {}
310335protected:
311336    ATTR_HOT int vsolve_non_dynamic();
312337private:
r30788r30789
352377    netlist_solver_parameters_t m_params;
353378
354379    template <int m_N, int _storage_N>
355    netlist_matrix_solver_t *create_solver(int gs_threshold, bool use_specific);
380    netlist_matrix_solver_t *create_solver(int size, int gs_threshold, bool use_specific);
356381};
357382
358383
trunk/src/emu/netlist/analog/nld_solver.c
r30788r30789
1515
1616#define USE_PIVOT_SEARCH (0)
1717#define VECTALT 1
18#define USE_GABS 0
19#define USE_MATRIX_GS 0
20#define SORP 1.059
1821
1922#define SOLVER_VERBOSE_OUT(x) do {} while (0)
2023//#define SOLVER_VERBOSE_OUT(x) printf x
2124
22/* Commented out for now. Relatively low number of terminals / net makes
25/* Commented out for now. Relatively low number of terminals / nes makes
2326 * the vectorizations this enables pretty expensive
2427 */
2528
2629#if 0
2730#pragma GCC optimize "-ffast-math"
31//#pragma GCC optimize "-funroll-loops"
32#pragma GCC optimize "-funswitch-loops"
2833#pragma GCC optimize "-fvariable-expansion-in-unroller"
29#pragma GCC optimize "-funswitch-loops"
34#pragma GCC optimize "-funsafe-loop-optimizations"
35#pragma GCC optimize "-ftree-loop-if-convert-stores"
36#pragma GCC optimize "-ftree-loop-distribution"
37#pragma GCC optimize "-ftree-loop-im"
38#pragma GCC optimize "-ftree-loop-ivcanon"
39#pragma GCC optimize "-fivopts"
40#pragma GCC optimize "-ftree-parallelize-loops=4"
41#pragma GCC optimize "-fvect-cost-model"
42#pragma GCC optimize "-fvariable-expansion-in-unroller"
3043#endif
3144
32
33static vector_t *create_vector(const int size)
45static vector_ops_t *create_ops(const int size)
3446{
3547    switch (size)
3648    {
3749        case 1:
38            return new vector_imp_t<1>();
50            return new vector_ops_impl_t<1>();
3951        case 2:
40            return new vector_imp_t<2>();
52            return new vector_ops_impl_t<2>();
4153        case 3:
42            return new vector_imp_t<3>();
54            return new vector_ops_impl_t<3>();
4355        case 4:
44            return new vector_imp_t<4>();
56            return new vector_ops_impl_t<4>();
4557        case 5:
46            return new vector_imp_t<5>();
58            return new vector_ops_impl_t<5>();
4759        case 6:
48            return new vector_imp_t<6>();
60            return new vector_ops_impl_t<6>();
4961        case 7:
50            return new vector_imp_t<7>();
62            return new vector_ops_impl_t<7>();
5163        case 8:
52            return new vector_imp_t<8>();
64            return new vector_ops_impl_t<8>();
65        case 9:
66            return new vector_ops_impl_t<9>();
67        case 10:
68            return new vector_ops_impl_t<10>();
69        case 11:
70            return new vector_ops_impl_t<11>();
71        case 12:
72            return new vector_ops_impl_t<12>();
5373        default:
54            return new vector_imp_t<0>(size);
74            return new vector_ops_impl_t<0>(size);
5575    }
5676}
5777
r30788r30789
6282    m_gt.add(0.0);
6383    m_go.add(0.0);
6484    m_Idr.add(0.0);
85    m_other_curanalog.add(NULL);
6586}
6687
6788ATTR_COLD void terms_t::set_pointers()
r30788r30789
7192        m_term[i]->m_gt1 = &m_gt[i];
7293        m_term[i]->m_go1 = &m_go[i];
7394        m_term[i]->m_Idr1 = &m_Idr[i];
95        m_other_curanalog[i] = &m_term[i]->m_otherterm->net().as_analog().m_cur_Analog;
7496    }
7597
76    m_ops = create_vector(m_gt.count());
98    m_ops = create_ops(m_gt.count());
7799}
78100
79101// ----------------------------------------------------------------------------------------
r30788r30789
172194   }
173195}
174196
197// ----------------------------------------------------------------------------------------
198// netlist_matrix_solver_direct
199// ----------------------------------------------------------------------------------------
200
175201template <int m_N, int _storage_N>
202netlist_matrix_solver_direct_t<m_N, _storage_N>::netlist_matrix_solver_direct_t(int size)
203: netlist_matrix_solver_t()
204, m_dim(size)
205{
206    m_terms = new terms_t *[N()];
207    m_rails_temp = new terms_t[N()];
208
209    for (int k = 0; k < N(); k++)
210    {
211        m_terms[k] = new terms_t;
212        m_row_ops[k] = create_ops(k);
213    }
214    m_row_ops[N()] = create_ops(N());
215}
216
217template <int m_N, int _storage_N>
218netlist_matrix_solver_direct_t<m_N, _storage_N>::~netlist_matrix_solver_direct_t()
219{
220    for (int k=0; k<_storage_N; k++)
221    {
222        //delete[] m_A[k];
223    }
224    //delete[] m_last_RHS;
225    //delete[] m_RHS;
226    delete[] m_terms;
227    delete[] m_rails_temp;
228    //delete[] m_row_ops;
229
230}
231
232template <int m_N, int _storage_N>
176233ATTR_HOT double netlist_matrix_solver_direct_t<m_N, _storage_N>::compute_next_timestep(const double hn)
177234{
178235    double new_solver_timestep = m_params.m_max_timestep;
r30788r30789
221278    for (netlist_analog_output_t * const *p = m_inps.first(); p != NULL; p = m_inps.next(p))
222279        if ((*p)->m_proxied_net->m_last_Analog != (*p)->m_proxied_net->m_cur_Analog)
223280            (*p)->set_Q((*p)->m_proxied_net->m_cur_Analog);
224#if 1
281
225282    for (int k = 0; k < m_nets.count(); k++)
226283    {
227284        netlist_analog_net_t *p= m_nets[k];
228285        p->m_last_Analog = p->m_cur_Analog;
229286    }
230#else
231        for (netlist_analog_output_t * const *p = m_inps.first(); p != NULL; p = m_inps.next(p))
232        {
233            if ((*p)->m_proxied_net->m_last_Analog != (*p)->m_proxied_net->m_cur_Analog)
234                (*p)->m_proxied_net->m_last_Analog = (*p)->m_proxied_net->m_cur_Analog;
235        }
236#endif
237287}
238288
239289
r30788r30789
331381   return next_time_step;
332382}
333383
334__attribute__ ((noinline)) static double tx(double * ATTR_ALIGN t  , const int &N)
335{
336    double tmp=0.0;
337    for (int k = 0; k<N; k++)
338        tmp += t[k] ;
339    return tmp;
340}
341
342void ttt()
343{
344    //typedef int  tt ;
345    static double *t = (double *) malloc(128*8);
346    for (int k = 0; k<128; k++)
347        t[k] = k;
348    double tmp;
349    tmp = tx(t, 16);
350    printf("t[0] %p %f\n", &t[0], t[0]);
351    printf("t[1] %p %f\n", &t[1], t[1]);
352    printf("%f\n", tmp);
353    free(t);
354
355}
356
357384template <int m_N, int _storage_N>
358385void netlist_matrix_solver_gauss_seidel_t<m_N, _storage_N>::log_stats()
359386{
r30788r30789
363390    printf("       ==> %d nets\n", this->N()); //, (*(*groups[i].first())->m_core_terms.first())->name().cstr());
364391    printf("       has %s elements\n", this->is_dynamic() ? "dynamic" : "no dynamic");
365392    printf("       has %s elements\n", this->is_timestep() ? "timestep" : "no timestep");
366    printf("       %10d invocations (%6d Hz)  %10d gs fails (%6.2f%%) %4.1f average\n",
393    printf("       %10d invocations (%6d Hz)  %10d gs fails (%6.2f%%) %6.3f average\n",
367394            this->m_calculations,
368395            this->m_calculations * 10 / (int) (this->netlist().time().as_double() * 10.0),
369396            this->m_gs_fail,
370397            100.0 * (double) this->m_gs_fail / (double) this->m_calculations,
371398            (double) this->m_gs_total / (double) this->m_calculations);
372    ttt();
373
374399#endif
375400}
376401
r30788r30789
391416{
392417    if (term->m_otherterm->net().isRailNet())
393418    {
394        //m_nets[k].m_rails.add(pterm);
395        m_rails[k].add(term, -1);
419        m_rails_temp[k].add(term, -1);
396420    }
397421    else
398422    {
399423        int ot = get_net_idx(&term->m_otherterm->net());
400424        if (ot>=0)
401425        {
402            m_terms[k].add(term, ot);
426            m_terms[k]->add(term, ot);
403427            SOLVER_VERBOSE_OUT(("Net %d Term %s %f %f\n", k, terms[i]->name().cstr(), terms[i]->m_gt, terms[i]->m_go));
404428        }
405429        /* Should this be allowed ? */
406430        else // if (ot<0)
407431        {
408           m_rails[k].add(term, ot);
432           m_rails_temp[k].add(term, ot);
409433           netlist().error("found term with missing othernet %s\n", term->name().cstr());
410434        }
411435    }
r30788r30789
415439template <int m_N, int _storage_N>
416440ATTR_COLD void netlist_matrix_solver_direct_t<m_N, _storage_N>::vsetup(netlist_analog_net_t::list_t &nets)
417441{
418    m_dim = nets.count();
419442
443    if (m_dim < nets.count())
444        netlist().error("Dimension %d less than %d", m_dim, nets.count());
445
420446    for (int k = 0; k < N(); k++)
421447    {
422        m_terms[k].clear();
423        m_rails[k].clear();
448        m_terms[k]->clear();
449        m_rails_temp[k].clear();
424450    }
425451
426452    netlist_matrix_solver_t::setup(nets);
427453
428454    for (int k = 0; k < N(); k++)
429455    {
430        m_terms[k].set_pointers();
431        m_rails[k].set_pointers();
456        m_terms[k]->m_railstart = m_terms[k]->count();
457        for (int i = 0; i < m_rails_temp[k].count(); i++)
458            this->m_terms[k]->add(m_rails_temp[k].terms()[i], m_rails_temp[k].net_other()[i]);
459
460        m_rails_temp[k].clear(); // no longer needed
461        m_terms[k]->set_pointers();
432462    }
433463
464#if 1
465
466    /* Sort in descending order by number of connected matrix voltages.
467     * The idea is, that for Gauss-Seidel algo the first voltage computed
468     * depends on the greatest number of previous voltages thus taking into
469     * account the maximum amout of information.
470     *
471     * This actually improves performance on popeye slightly. Average
472     * GS computations reduce from 2.509 to 2.370
473     *
474     * Smallest to largest : 2.613
475     * Unsorted            : 2.509
476     * Largest to smallest : 2.370
477     *
478     * Sorting as a general matrix pre-conditioning is mentioned in
479     * literature but I have found no articles about Gauss Seidel.
480     *
481     */
482
483
484    for (int k = 0; k < N() / 2; k++)
485        for (int i = 0; i < N() - 1; i++)
486        {
487            if (m_terms[i]->m_railstart < m_terms[i+1]->m_railstart)
488            {
489                std::swap(m_terms[i],m_terms[i+1]);
490                m_nets.swap(i, i+1);
491            }
492        }
493
494    for (int k = 0; k < N(); k++)
495    {
496        int *other = m_terms[k]->net_other();
497        for (int i = 0; i < m_terms[k]->count(); i++)
498            if (other[i] != -1)
499                other[i] = get_net_idx(&m_terms[k]->terms()[i]->m_otherterm->net());
500    }
501
502#endif
503
434504}
435505
436506template <int m_N, int _storage_N>
437507ATTR_HOT void netlist_matrix_solver_direct_t<m_N, _storage_N>::build_LE()
438508{
439    for (int k=0; k < _storage_N; k++)
440        for (int i=0; i < _storage_N; i++)
509#if 0
510    for (int k=0; k < N(); k++)
511        for (int i=0; i < N(); i++)
441512            m_A[k][i] = 0.0;
513#endif
442514
443515    for (int k = 0; k < N(); k++)
444516   {
517        for (int i=0; i < N(); i++)
518            m_A[k][i] = 0.0;
519
445520        double rhsk = 0.0;
446521        double akk  = 0.0;
447522        {
448            const int terms_count = m_terms[k].count();
449            const double *gt = m_terms[k].gt();
450            const double *Idr = m_terms[k].Idr();
523            const int terms_count = m_terms[k]->count();
524            const double * RESTRICT gt = m_terms[k]->gt();
525            const double * RESTRICT go = m_terms[k]->go();
526            const double * RESTRICT Idr = m_terms[k]->Idr();
451527#if VECTALT
452528
453529            for (int i = 0; i < terms_count; i++)
454530            {
455                //printf("A %d %d %s %f %f\n",t.net_this, t.net_other, t.term->name().cstr(), t.term->m_gt, t.term->m_go);
456
457531                rhsk = rhsk + Idr[i];
458532                akk = akk + gt[i];
459                //m_A[k][net_other[i]] += -go[i];
460533            }
461534#else
462            m_terms[k].ops()->sum2(Idr, gt, rhsk, akk);
463//                    rhsk += sum(m_terms[k].Idr(), terms_count);
464//            akk += sum(m_terms[k].gt(), terms_count);
535            const netlist_terminal_t * const * terms = this->m_terms[k]->terms();
536            m_terms[k]->ops()->sum2(Idr, gt, rhsk, akk);
465537#endif
466        }
467        {
468            const int rails_count = m_rails[k].count();
469            const netlist_terminal_t * const *rails = m_rails[k].terms();
470            const double *gt = m_rails[k].gt();
471            const double *Idr = m_rails[k].Idr();
472#if VECTALT
473
474            for (int i = 0; i < rails_count; i++)
538            double * const * RESTRICT other_cur_analog = m_terms[k]->other_curanalog();
539            for (int i = m_terms[k]->m_railstart; i < terms_count; i++)
475540            {
476                rhsk = rhsk + Idr[i];
477                akk = akk + gt[i];
541                //rhsk = rhsk + go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog();
542                rhsk = rhsk + go[i] * *other_cur_analog[i];
478543            }
479#else
480            m_rails[k].ops()->sum2(Idr, gt, rhsk, akk);
481            //rhsk += sum(m_rails[k].Idr(), rails_count);
482            //akk += sum(m_rails[k].gt(), rails_count);
483#endif
484            const double *go = m_rails[k].go();
485            for (int i = 0; i < rails_count; i++)
486            {
487                rhsk = rhsk + go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog();
488            }
489544        }
545#if 0
490546        /*
491547         * Matrix preconditioning with 1.0 / Akk
492548         *
r30788r30789
497553        m_RHS[k] = rhsk * akk;
498554        m_A[k][k] += 1.0;
499555        {
500            const int terms_count = m_terms[k].count();
501            //const netlist_terminal_t * const *terms = m_terms[k].terms();
502            const int *net_other = m_terms[k].net_other();
503            const double *go = m_terms[k].go();
556            const int *net_other = m_terms[k]->net_other();
557            const double *go = m_terms[k]->go();
558            const int railstart =  m_terms[k]->m_railstart;
504559
505            for (int i = 0; i < terms_count; i++)
560            for (int i = 0; i < railstart; i++)
506561            {
507562                m_A[k][net_other[i]] += -go[i] * akk;
508563            }
509564        }
565#else
566        m_RHS[k] = rhsk;
567        m_A[k][k] += akk;
568        {
569            const int * RESTRICT net_other = m_terms[k]->net_other();
570            const double * RESTRICT go = m_terms[k]->go();
571            const int railstart =  m_terms[k]->m_railstart;
572
573            for (int i = 0; i < railstart; i++)
574            {
575                m_A[k][net_other[i]] += -go[i];
576            }
577        }
578#endif
510579    }
511580}
512581
r30788r30789
558627            const double f1 = - m_A[j][i] * f;
559628         if (f1 != 0.0)
560629         {
630#if 0 && VECTALT
561631                for (int k = i + 1; k < kN; k++)
562632                    m_A[j][k] += m_A[i][k] * f1;
633#else
634                // addmult gives some performance increase here...
635                m_row_ops[kN - (i + 1)]->addmult(&m_A[j][i+1], &m_A[i][i+1], f1) ;
636#endif
563637               m_RHS[j] += m_RHS[i] * f1;
564638         }
565639      }
r30788r30789
567641   /* back substitution */
568642   for (int j = kN - 1; j >= 0; j--)
569643   {
570        //__builtin_prefetch(&m_A[j-1][j], 0);
571644      double tmp = 0;
572645
573646      for (int k = j + 1; k < kN; k++)
r30788r30789
726799     * Need something like that for gaussian elimination as well.
727800     */
728801
729#if 0
802#if USE_MATRIX_GS
730803    ATTR_ALIGN double new_v[_storage_N] = { 0.0 };
731804    const int iN = this->N();
732805
r30788r30789
740813    {
741814        new_v[k] = this->m_nets[k]->m_cur_Analog;
742815    }
816
817    // Frobenius norm for (D-L)^(-1)U
818    double frobU;
819    double frobL;
820    double frob;
821    double norm;
743822    do {
744823        resched = false;
745824        double cerr = 0.0;
825        frobU = 0;
826        frobL = 0;
827        frob = 0;
828        norm = 0;
746829
747830        for (int k = 0; k < iN; k++)
748831        {
749832            double Idrive = 0;
750
833            double norm_t = 0;
751834            // Reduction loops need -ffast-math
752835            for (int i = 0; i < iN; i++)
753                Idrive -= this->m_A[k][i] * new_v[i];
836                Idrive += this->m_A[k][i] * new_v[i];
754837
755            const double new_val = (this->m_RHS[k] + Idrive + this->m_A[k][k] * new_v[k]) / this->m_A[k][k];
838            for (int i = 0; i < iN; i++)
839            {
840                if (i < k) frobL += this->m_A[k][i] * this->m_A[k][i] / this->m_A[k][k] /this-> m_A[k][k];
841                if (i > k) frobU += this->m_A[k][i] * this->m_A[k][i] / this->m_A[k][k] / this->m_A[k][k];
842                frob += this->m_A[k][i] * this->m_A[k][i];
843                norm_t += fabs(this->m_A[k][i]);
844            }
756845
846            if (norm_t > norm) norm = norm_t;
847            const double new_val = (this->m_RHS[k] - Idrive + this->m_A[k][k] * new_v[k]) / this->m_A[k][k];
848
757849            const double e = fabs(new_val - new_v[k]);
758850            cerr = (e > cerr ? e : cerr);
759851            new_v[k] = new_val;
r30788r30789
765857        }
766858        resched_cnt++;
767859    } while (resched && (resched_cnt < this->m_params.m_gs_loops));
860    printf("Frobenius %f %f %f %f %f\n", sqrt(frobU), sqrt(frobL), sqrt(frobU * frobL + (double) iN), sqrt(frob), norm);
861    frobU = sqrt(frobU);
862    frobL = sqrt(frobL);
863    printf("Estimate Frobenius %f\n", 1.0 - (1.0 - frobU -frobL) / (1.0 - frobL) ); //        printf("Frobenius %f\n", sqrt(frob / (double) (iN * iN) ));
768864
865
866    this->store(new_v, false);
867
769868    this->m_gs_total += resched_cnt;
770869    if (resched)
771870    {
r30788r30789
778877    else {
779878        this->m_calculations++;
780879
781        this->store(new_v, false);
782
783880        return resched_cnt;
784881    }
785882
r30788r30789
787884    const int iN = this->N();
788885   bool resched = false;
789886   int  resched_cnt = 0;
887   /* some minor SOR helps on typical netlist matrices */
790888
791   /* over-relaxation not really works on these matrices */
792   //const double w = 1.0; //2.0 / (1.0 + sin(3.14159 / (m_nets.count()+1)));
793   //const double w1 = 1.0 - w;
889   /* ideally, we could get an estimate for the spectral radius of
890    * Inv(D - L) * U
891    *
892    * and estimate using
893    *
894    * omega = 2.0 / (1.0 + sqrt(1-rho))
895    */
794896
897    const double ws = SORP; //1.045; //2.0 / (1.0 + /*sin*/(3.14159 * 5.5 / (double) (m_nets.count()+1)));
898
795899   ATTR_ALIGN double w[_storage_N];
796900   ATTR_ALIGN double one_m_w[_storage_N];
797901   ATTR_ALIGN double RHS[_storage_N];
r30788r30789
809913      double RHS_t = 0.0;
810914
811915      {
812           const netlist_terminal_t * const * rails = this->m_rails[k].terms();
813           //const int * othernet = this->m_rails[k].m_othernet;
814           const int rail_count = this->m_rails[k].count();
815           double *gt = this->m_rails[k].gt();
816            const double *go = this->m_rails[k].go();
817            const double *Idr = this->m_rails[k].Idr();
916            const int term_count = this->m_terms[k]->count();
917            const double * RESTRICT gt = this->m_terms[k]->gt();
918            const double * RESTRICT go = this->m_terms[k]->go();
919            const double * RESTRICT Idr = this->m_terms[k]->Idr();
818920#if VECTALT
819            for (int i = 0; i < rail_count; i++)
820            {
821                gtot_t += gt[i];
822                gabs_t += fabs(go[i]);
823                RHS_t += Idr[i];
824                RHS_t += go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog();
825            }
826#else
827            this->m_rails[k].ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t);
828
829            for (int i = 0; i < rail_count; i++)
830                RHS_t += go[i] * rails[i]->m_otherterm->net().as_analog().Q_Analog();
831#endif
832      }
833      {
834            const int term_count = this->m_terms[k].count();
835            const double *gt = this->m_terms[k].gt();
836            const double *go = this->m_terms[k].go();
837            const double *Idr = this->m_terms[k].Idr();
838#if VECTALT
839921            for (int i = 0; i < term_count; i++)
840922            {
841923                gtot_t += gt[i];
842                gabs_t += fabs(go[i]);
924                if (USE_GABS) gabs_t += fabs(go[i]);
843925                RHS_t += Idr[i];
844926            }
845927#else
846            this->m_terms[k].ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t);
928            if (USE_GABS)
929                this->m_terms[k]->ops()->sum2a(gt, Idr, go, gtot_t, RHS_t, gabs_t);
930            else
931                this->m_terms[k]->ops()->sum2(gt, Idr, gtot_t, RHS_t);
847932#endif
933            double * const *other_cur_analog = this->m_terms[k]->other_curanalog();
934            for (int i = this->m_terms[k]->m_railstart; i < term_count; i++)
935                //RHS_t += go[i] * terms[i]->m_otherterm->net().as_analog().Q_Analog();
936                RHS_t += go[i] * *other_cur_analog[i];
848937      }
938
849939        RHS[k] = RHS_t;
850940
851        gabs_t *= 0.9; // avoid rounding issues
852      if (gabs_t <= gtot_t)
853      {
854            const double ws = 1.0;
941        //if (fabs(gabs_t - fabs(gtot_t)) > 1e-20)
942        //    printf("%d %e abs: %f tot: %f\n",k, gabs_t / gtot_t -1.0, gabs_t, gtot_t);
943
944        gabs_t *= 0.5; // avoid rounding issues
945        if (!USE_GABS || gabs_t <= gtot_t)
946        {
855947            w[k] = ws / gtot_t;
856948            one_m_w[k] = 1.0 - ws;
857      }
858      else
859      {
949        }
950        else
951        {
952            //printf("abs: %f tot: %f\n", gabs_t, gtot_t);
860953            w[k] = 1.0 / (gtot_t + gabs_t);
861954            one_m_w[k] = 1.0 - 1.0 * gtot_t / (gtot_t + gabs_t);
862      }
955        }
863956
864957   }
865958
r30788r30789
869962
870963      for (int k = 0; k < iN; k++)
871964      {
872            const int * net_other = this->m_terms[k].net_other();
873         const int term_count = this->m_terms[k].count();
874            const double *go = this->m_terms[k].go();
965            const int * RESTRICT net_other = this->m_terms[k]->net_other();
966         const int railstart = this->m_terms[k]->m_railstart;
967            const double * RESTRICT go = this->m_terms[k]->go();
875968            // -msse2 -msse3 -msse4.1 -msse4.2 -mfpmath=sse -ftree-vectorizer-verbose=3 -fprefetch-loop-arrays -ffast-math
876969
877         double Idrive = 0;
970         double Idrive;
878971
879            for (int i = 0; i < term_count; i++)
972            Idrive = 0.0;
973            for (int i = 0; i < railstart; i++)
880974                Idrive = Idrive + go[i] * new_V[net_other[i]];
881975
882976            //double new_val = (net->m_cur_Analog * gabs[k] + iIdr) / (gtot[k]);
r30788r30789
9311025
9321026   register_param("ACCURACY", m_accuracy, 1e-7);
9331027   register_param("GS_LOOPS", m_gs_loops, 9);              // Gauss-Seidel loops
1028    //register_param("GS_THRESHOLD", m_gs_threshold, 99);          // below this value, gaussian elimination is used
9341029    register_param("GS_THRESHOLD", m_gs_threshold, 5);          // below this value, gaussian elimination is used
9351030    register_param("NR_LOOPS", m_nr_loops, 25);             // Newton-Raphson loops
9361031   register_param("PARALLEL", m_parallel, 0);
r30788r30789
10191114}
10201115
10211116template <int m_N, int _storage_N>
1022netlist_matrix_solver_t * NETLIB_NAME(solver)::create_solver(const int gs_threshold, const bool use_specific)
1117netlist_matrix_solver_t * NETLIB_NAME(solver)::create_solver(int size, const int gs_threshold, const bool use_specific)
10231118{
10241119    if (use_specific && m_N == 1)
10251120        return new netlist_matrix_solver_direct1_t();
r30788r30789
10271122        return new netlist_matrix_solver_direct2_t();
10281123    else
10291124    {
1030        if (_storage_N >= gs_threshold)
1031            return new netlist_matrix_solver_gauss_seidel_t<m_N,_storage_N>();
1125        if (size >= gs_threshold)
1126            return new netlist_matrix_solver_gauss_seidel_t<m_N,_storage_N>(size);
10321127        else
1033            return new netlist_matrix_solver_direct_t<m_N, _storage_N>();
1128            return new netlist_matrix_solver_direct_t<m_N, _storage_N>(size);
10341129    }
10351130}
10361131
r30788r30789
10871182      {
10881183#if 1
10891184         case 1:
1090            ms = create_solver<1,1>(gs_threshold, use_specific);
1185            ms = create_solver<1,1>(1, gs_threshold, use_specific);
10911186            break;
10921187         case 2:
1093                ms = create_solver<2,2>(gs_threshold, use_specific);
1188                ms = create_solver<2,2>(2, gs_threshold, use_specific);
10941189            break;
10951190         case 3:
1096                ms = create_solver<3,3>(gs_threshold, use_specific);
1191                ms = create_solver<3,3>(3, gs_threshold, use_specific);
10971192            break;
10981193         case 4:
1099                ms = create_solver<4,4>(gs_threshold, use_specific);
1194                ms = create_solver<4,4>(4, gs_threshold, use_specific);
11001195            break;
11011196         case 5:
1102                ms = create_solver<5,5>(gs_threshold, use_specific);
1197                ms = create_solver<5,5>(5, gs_threshold, use_specific);
11031198            break;
11041199         case 6:
1105                ms = create_solver<6,6>(gs_threshold, use_specific);
1200                ms = create_solver<6,6>(6, gs_threshold, use_specific);
11061201            break;
11071202            case 7:
1108                ms = create_solver<7,7>(gs_threshold, use_specific);
1203                ms = create_solver<7,7>(7, gs_threshold, use_specific);
11091204                break;
11101205            case 8:
1111                ms = create_solver<8,8>(gs_threshold, use_specific);
1206                ms = create_solver<8,8>(8, gs_threshold, use_specific);
11121207                break;
11131208            case 12:
1114                ms = create_solver<12,12>(gs_threshold, use_specific);
1209                ms = create_solver<12,12>(12, gs_threshold, use_specific);
11151210                break;
11161211#endif
1117         default:
1212            default:
11181213            if (net_count <= 16)
11191214            {
1120                   ms = create_solver<0,16>(gs_threshold, use_specific);
1215                   ms = create_solver<0,16>(net_count, gs_threshold, use_specific);
11211216            }
11221217            else if (net_count <= 32)
11231218            {
1124                   ms = create_solver<0,32>(gs_threshold, use_specific);
1219                   ms = create_solver<0,32>(net_count, gs_threshold, use_specific);
11251220            }
11261221            else if (net_count <= 64)
11271222            {
1128                   ms = create_solver<0,64>(gs_threshold, use_specific);
1223                   ms = create_solver<0,64>(net_count, gs_threshold, use_specific);
11291224            }
11301225            else
11311226            {
trunk/src/emu/netlist/plists.h
r30788r30789
116116      }
117117   }
118118
119    ATTR_HOT inline void swap(const int pos1, const int pos2)
120    {
121        assert((pos1>=0) && (pos1<m_count));
122        assert((pos2>=0) && (pos2<m_count));
123        _ListClass tmp = m_list[pos1];
124        m_list[pos1] = m_list[pos2];
125        m_list[pos2] =tmp;
126    }
127
119128   ATTR_HOT inline bool contains(const _ListClass &elem) const
120129   {
121130      for (_ListClass *i = m_list; i < m_list + m_count; i++)

Previous 199869 Revisions Next


© 1997-2024 The MAME Team